diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index cbd6f64976d21..920a47b5afe07 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1839,6 +1839,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// \returns true if the subtarget requires a wait for xcnt before atomic /// flat/global stores & rmw. bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } + + /// \returns the number of significant bits in the immediate field of the + /// S_NOP instruction. + unsigned getSNopBits() const { + if (getGeneration() >= AMDGPUSubtarget::GFX12) + return 7; + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 4; + return 3; + } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5106478a95b43..ee3e8dc58b468 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1932,8 +1932,9 @@ void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const { DebugLoc DL = MBB.findDebugLoc(MI); + unsigned MaxSNopCount = 1u << ST.getSNopBits(); while (Quantity > 0) { - unsigned Arg = std::min(Quantity, 8u); + unsigned Arg = std::min(Quantity, MaxSNopCount); Quantity -= Arg; BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll index 393a462954003..5720b882f4e73 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -58,8 +58,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 @@ -109,8 +108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 9 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -185,8 +183,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -220,8 +217,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 9 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm bb: @@ -277,8 +273,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 @@ -302,8 +297,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) % ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -336,8 +330,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -369,8 +362,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll index 002ccd6060681..635d2a2d16a76 100644 --- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll +++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll @@ -9,8 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() ; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] ; GCN-NOT: v_accvgpr_write ; GCN: v_mfma_f32_32x32x1f32 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NOT: v_accvgpr_read ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] @@ -28,8 +27,7 @@ bb: ; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}] ; GCN-NOT: v_accvgpr_read ; GCN: v_mfma_f32_32x32x1f32 a[[[N:[0-9]+]]: -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NOT: v_accvgpr_read ; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}] @@ -80,8 +78,7 @@ bb: ; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] ; GCN-COUNT-32: v_accvgpr_write ; GCN: v_mfma_f32_32x32x1f32 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NOT: v_accvgpr_read ; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}] diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index c226dae3d64a9..9e240238c1066 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -63,8 +63,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a16, v39 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse @@ -181,8 +180,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill @@ -487,8 +485,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 { ; GFX90A-NEXT: ; copy ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_accvgpr_write_b32 a32, v35 ; Reload Reuse -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a2 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; use a3 v[0:31] @@ -965,8 +962,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a16, v39 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v39, a0 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v38, a11 ; Reload Reuse ; GFX908-NEXT: v_accvgpr_read_b32 v37, a12 ; Reload Reuse @@ -1084,8 +1080,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 { ; GFX90A-NEXT: v_accvgpr_read_b32 v34, a32 ; Reload Reuse ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31] -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: buffer_store_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir index 22c913496b734..b5c3e3214f125 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-snop-padding.mir @@ -63,52 +63,41 @@ body: | ; GCN16-NEXT: successors: %bb.1(0x80000000) ; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN16-NEXT: {{ $}} - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_BRANCH %bb.1 ; GCN16-NEXT: {{ $}} ; GCN16-NEXT: bb.1: ; GCN16-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN16-NEXT: {{ $}} - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 10, implicit $exec - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec ; GCN16-NEXT: {{ $}} ; GCN16-NEXT: bb.2: ; GCN16-NEXT: successors: %bb.3(0x80000000) ; GCN16-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 ; GCN16-NEXT: {{ $}} - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_NOP 0 - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 20, implicit $exec - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_BRANCH %bb.3 ; GCN16-NEXT: {{ $}} ; GCN16-NEXT: bb.3: ; GCN16-NEXT: liveins: $sgpr10_sgpr11 ; GCN16-NEXT: {{ $}} - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: $sgpr5 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0 - ; GCN16-NEXT: S_NOP 7 - ; GCN16-NEXT: S_NOP 7 + ; GCN16-NEXT: S_NOP 15 ; GCN16-NEXT: SI_RETURN bb.0: liveins: $sgpr6, $sgpr10_sgpr11 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll index 303ea50dc16cc..12a998ad82cd2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll @@ -87,8 +87,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v0, 2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -191,8 +190,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x2bf16 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -256,8 +254,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v1, 2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 @@ -308,8 +305,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -424,8 +420,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v1, 2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 @@ -476,8 +471,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -513,8 +507,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 @@ -538,8 +531,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x8bf16 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll index ff77d5ccbe312..5ab8706f28f5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -59,8 +59,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112 @@ -117,8 +116,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112 @@ -175,8 +173,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 2 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112 @@ -233,8 +230,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 2 ; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112 @@ -283,8 +279,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 @@ -319,8 +314,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: s_nop 10 ; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 @@ -347,8 +341,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 2 +; GFX90A-VGPR-NEXT: s_nop 10 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 @@ -375,8 +368,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 2 +; GFX942-VGPR-NEXT: s_nop 10 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 @@ -505,8 +497,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 @@ -542,8 +533,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 2 +; GFX942-NEXT: s_nop 10 ; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16 @@ -570,8 +560,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 2 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 @@ -599,8 +588,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 2 +; GFX942-VGPR-NEXT: s_nop 10 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16 @@ -632,8 +620,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; @@ -671,8 +658,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 2 +; GFX90A-VGPR-NEXT: s_nop 10 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX90A-VGPR-NEXT: s_endpgm ; @@ -795,8 +781,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] @@ -823,8 +808,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] @@ -847,8 +831,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] @@ -871,8 +854,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] @@ -896,8 +878,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -914,8 +895,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 0 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -932,8 +912,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -950,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1) ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -975,8 +953,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -993,8 +970,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1011,8 +987,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1029,8 +1004,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1054,8 +1028,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 1.0 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1072,8 +1045,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 1.0 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1090,8 +1062,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1108,8 +1079,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1) ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1133,8 +1103,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1.0 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1151,8 +1120,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1.0 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1169,8 +1137,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1187,8 +1154,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1212,8 +1178,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 64 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1230,8 +1195,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 64 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1248,8 +1212,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1266,8 +1229,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1299,8 +1261,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1325,8 +1286,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1354,8 +1314,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1383,8 +1342,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0] -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1416,8 +1374,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1442,8 +1399,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1468,8 +1424,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1494,8 +1449,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1527,8 +1481,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_ ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1553,8 +1506,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_ ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1579,8 +1531,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_ ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3 ; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 0 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1605,8 +1556,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_ ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] @@ -1639,8 +1589,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1666,8 +1615,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1695,8 +1643,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1724,8 +1671,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1757,8 +1703,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1784,8 +1729,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] @@ -1813,8 +1757,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9] -; GFX90A-VGPR-NEXT: s_nop 7 -; GFX90A-VGPR-NEXT: s_nop 7 +; GFX90A-VGPR-NEXT: s_nop 15 ; GFX90A-VGPR-NEXT: s_nop 1 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX90A-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] @@ -1842,8 +1785,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9] -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index beda16c17a5c9..dc4c929124fec 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -31,26 +31,26 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i3 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32) define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -73,47 +73,26 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -135,7 +114,26 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -193,8 +191,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -230,8 +227,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -267,8 +263,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -304,8 +299,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 { ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -319,26 +313,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -361,47 +355,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -423,7 +396,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -452,26 +444,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -494,47 +486,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -556,7 +527,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -585,26 +575,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -627,47 +617,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -689,7 +658,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -718,26 +706,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -760,47 +748,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: -; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -822,7 +789,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm -; +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 @@ -880,8 +866,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -917,8 +902,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -954,8 +938,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -991,8 +974,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1035,8 +1017,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1072,8 +1053,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1109,8 +1089,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1146,8 +1125,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1190,8 +1168,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1227,8 +1204,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1264,8 +1240,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1301,8 +1276,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1345,8 +1319,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1382,8 +1355,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1419,8 +1391,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: s_nop 2 +; GFX950-SDAG-NEXT: s_nop 10 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1456,8 +1427,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-GISEL-NEXT: s_nop 7 -; GFX950-GISEL-NEXT: s_nop 2 +; GFX950-GISEL-NEXT: s_nop 10 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1471,46 +1441,85 @@ bb: } define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX942-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16: ; GFX942-AGPRCD: ; %bb.0: ; %bb ; GFX942-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -1533,47 +1542,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX942-AGPRCD-NEXT: s_nop 5 ; GFX942-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] ; GFX942-AGPRCD-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16: ; GFX950-AGPRCD: ; %bb.0: ; %bb ; GFX950-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -1604,66 +1572,121 @@ bb: } define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -1701,7 +1724,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -1739,67 +1761,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -1837,7 +1798,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, < ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -1883,46 +1843,85 @@ bb: } define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX942-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX942-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s6 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX950-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s6 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16: ; GFX942-AGPRCD: ; %bb.0: ; %bb ; GFX942-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -1945,47 +1944,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX942-AGPRCD-NEXT: s_nop 5 ; GFX942-AGPRCD-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9] ; GFX942-AGPRCD-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16: ; GFX950-AGPRCD: ; %bb.0: ; %bb ; GFX950-AGPRCD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 @@ -2016,66 +1974,121 @@ bb: } define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-SDAG-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX942-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; +; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -2113,7 +2126,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -2151,67 +2163,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[24:25] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[24:25] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s24, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[18:19] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s24 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -2249,7 +2200,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[24:25] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[24:25] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 @@ -2295,53 +2245,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2367,7 +2363,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -2398,54 +2393,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2471,7 +2418,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -2510,73 +2456,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2618,7 +2626,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2661,74 +2668,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2770,7 +2709,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2821,53 +2759,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2893,7 +2877,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -2924,54 +2907,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -2997,7 +2932,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3036,53 +2970,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3108,7 +3088,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3139,54 +3118,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3212,7 +3143,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3251,53 +3181,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3323,7 +3299,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3354,54 +3329,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3427,7 +3354,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3466,53 +3392,99 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: s_nop 6 +; GFX942-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX942-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_nop 5 +; GFX942-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s12 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: s_nop 7 +; GFX950-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s4, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s14 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 6 +; GFX950-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3538,7 +3510,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: s_nop 5 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3569,54 +3540,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: s_nop 5 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3642,7 +3565,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: s_nop 6 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c @@ -3681,73 +3603,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3789,7 +3773,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3832,74 +3815,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3941,7 +3856,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -3992,73 +3906,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4100,7 +4076,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4143,74 +4118,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4252,7 +4159,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4303,73 +4209,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4411,7 +4379,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4454,74 +4421,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4563,7 +4462,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4614,73 +4512,135 @@ bb: } define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 { -; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: -; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-SDAG-NEXT: s_endpgm +; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: +; GFX942-SDAG: ; %bb.0: ; %bb +; GFX942-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX942-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-SDAG-NEXT: s_nop 9 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-SDAG-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: +; GFX942-GISEL: ; %bb.0: ; %bb +; GFX942-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX942-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX942-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX942-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX942-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX942-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX942-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-GISEL-NEXT: s_nop 9 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX942-GISEL-NEXT: s_endpgm ; -; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: -; GFX942-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX942-VGPRCD-GISEL-NEXT: s_endpgm +; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: +; GFX950-SDAG: ; %bb.0: ; %bb +; GFX950-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c +; GFX950-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16 +; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s20 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s22 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-SDAG-NEXT: s_nop 10 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: +; GFX950-GISEL: ; %bb.0: ; %bb +; GFX950-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c +; GFX950-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c +; GFX950-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] +; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s20, s18 +; GFX950-GISEL-NEXT: s_mov_b32 s21, s19 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, s26 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0 +; GFX950-GISEL-NEXT: s_nop 10 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 +; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 +; GFX950-GISEL-NEXT: s_endpgm ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: ; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4722,7 +4682,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: ; GFX942-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX942-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4765,74 +4724,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-AGPRCD-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-AGPRCD-GISEL-NEXT: s_endpgm -; -; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: -; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 -; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 -; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-SDAG-NEXT: s_endpgm -; -; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: -; GFX950-VGPRCD-GISEL: ; %bb.0: ; %bb -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x2c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x3c -; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s26, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17] -; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s20, s18 -; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s21, s19 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[20:21] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v22, s26 -; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[22:23] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 7 -; GFX950-VGPRCD-GISEL-NEXT: s_nop 2 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[24:25] -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[24:25] offset:16 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[24:25] offset:32 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[24:25] offset:48 -; GFX950-VGPRCD-GISEL-NEXT: s_endpgm -; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: ; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4874,7 +4765,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX950-AGPRCD-SDAG-NEXT: s_endpgm -; ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8: ; GFX950-AGPRCD-GISEL: ; %bb.0: ; %bb ; GFX950-AGPRCD-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 @@ -4928,5 +4818,9 @@ attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX942: {{.*}} ; GFX942-VGPRCD: {{.*}} +; GFX942-VGPRCD-GISEL: {{.*}} +; GFX942-VGPRCD-SDAG: {{.*}} ; GFX950: {{.*}} ; GFX950-VGPRCD: {{.*}} +; GFX950-VGPRCD-GISEL: {{.*}} +; GFX950-VGPRCD-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 284ced1727b7e..033a35f69a0bd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -178,8 +178,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -223,8 +222,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -394,8 +392,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] ; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 10 ; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -428,8 +425,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 10 ; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 078a043b94604..753206206180a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -479,8 +479,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_nop 8 ; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 @@ -598,8 +597,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 ; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: s_nop 8 ; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 @@ -864,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_nop 8 ; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 @@ -983,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; VGPRRC-NEXT: v_mov_b32_e32 v50, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v51, s19 ; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: s_nop 8 ; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 @@ -1169,8 +1165,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1210,8 +1205,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: s_nop 11 ; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 ; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 ; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1234,8 +1228,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 @@ -1342,8 +1335,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1383,8 +1375,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: s_nop 11 ; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 ; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 ; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1407,8 +1398,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 @@ -2199,8 +2189,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2228,8 +2217,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2257,8 +2245,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] ; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: s_nop 10 ; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2286,8 +2273,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] ; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: s_nop 10 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2384,8 +2370,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2413,8 +2398,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2442,8 +2426,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: s_nop 10 ; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -2471,8 +2454,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: s_nop 10 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -3083,8 +3065,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_nop 8 ; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 @@ -3205,8 +3186,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 @@ -3497,8 +3477,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: s_nop 8 ; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1 @@ -3619,8 +3598,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1 @@ -3827,8 +3805,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3868,8 +3845,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: s_nop 11 ; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 ; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 ; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3892,8 +3868,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 @@ -4000,8 +3975,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4041,8 +4015,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: s_nop 11 ; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 ; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 ; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -4065,8 +4038,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: s_nop 11 ; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 ; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 ; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 @@ -4932,8 +4904,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -4961,8 +4932,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -4995,8 +4965,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] ; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: s_nop 10 ; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5029,8 +4998,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] ; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: s_nop 10 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5142,8 +5110,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5171,8 +5138,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -5205,8 +5171,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; HEURRC-NEXT: s_nop 1 ; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: v_mov_b32_e32 v16, 0 -; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: s_nop 10 ; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5239,8 +5204,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; VGPRRC-NEXT: s_nop 1 ; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 -; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: s_nop 10 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll index 856185b17e5fd..d24f1f0b526c3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll @@ -50,8 +50,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_mov_b32_e32 v1, 2 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 @@ -103,8 +102,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -138,8 +136,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: s_nop 9 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 @@ -163,8 +160,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index c1508c1675fe0..7e30af96bb8b9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -97,8 +97,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -233,8 +232,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -337,8 +335,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -394,8 +391,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -451,8 +447,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 @@ -514,8 +509,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -582,8 +576,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -634,8 +627,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -669,8 +661,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -696,8 +687,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 0 +; GFX942-VGPR-NEXT: s_nop 8 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -872,8 +862,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -940,8 +929,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -992,8 +980,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1028,8 +1015,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -1056,8 +1042,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 @@ -1091,8 +1076,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1120,8 +1104,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1145,8 +1128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; @@ -1165,8 +1147,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x4_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 9 ; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; @@ -1183,8 +1164,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: @@ -1275,8 +1255,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -1415,8 +1394,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -1523,8 +1501,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 @@ -1584,8 +1561,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: v_mov_b32_e32 v5, s3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 @@ -1645,8 +1621,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-VGPR-NEXT: v_mov_b32_e32 v37, s3 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 2 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112 @@ -1714,8 +1689,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -1785,8 +1759,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -1840,8 +1813,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1878,8 +1850,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 9 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -1908,8 +1879,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -2108,8 +2078,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -2179,8 +2148,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -2234,8 +2202,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -2273,8 +2240,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 9 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -2303,8 +2269,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -2343,8 +2308,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2375,8 +2339,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2403,8 +2366,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: s_nop 10 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; @@ -2536,8 +2498,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 @@ -2658,8 +2619,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 @@ -2748,8 +2708,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -2805,8 +2764,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -2862,8 +2820,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 2 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 @@ -2925,8 +2882,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 @@ -2993,8 +2949,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 @@ -3045,8 +3000,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -3080,8 +3034,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: s_nop 9 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -3107,8 +3060,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -3145,8 +3097,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 @@ -3177,8 +3128,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 @@ -3211,8 +3161,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: s_nop 8 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -3228,8 +3177,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -3244,8 +3192,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -3645,8 +3592,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -3782,8 +3728,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -3887,8 +3832,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -3945,8 +3889,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 @@ -4003,8 +3946,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 @@ -4068,8 +4010,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -4136,8 +4077,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -4188,8 +4128,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -4224,8 +4163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -4252,8 +4190,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 0 +; GFX942-VGPR-NEXT: s_nop 8 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -4502,8 +4439,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -4541,8 +4477,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -4578,8 +4513,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: s_nop 8 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -4610,8 +4544,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 0 +; GFX942-VGPR-NEXT: s_nop 8 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -4649,8 +4582,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -4691,8 +4623,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 @@ -4730,8 +4661,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -4750,8 +4680,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], 1.0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -4768,8 +4697,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 1 +; GFX942-VGPR-NEXT: s_nop 9 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -4821,8 +4749,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 @@ -4889,8 +4816,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 @@ -4948,8 +4874,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 @@ -4970,8 +4895,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -4990,8 +4914,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) % ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 @@ -5131,8 +5054,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 1 +; NOLIT-SRCC-NEXT: s_nop 9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -5186,8 +5108,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 1 +; LIT-SRCC-NEXT: s_nop 9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 @@ -5242,8 +5163,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -5274,8 +5194,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v2, a[0:15] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -5304,8 +5223,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 0 +; GFX942-VGPR-NEXT: s_nop 8 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5357,8 +5275,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 @@ -5457,8 +5374,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 @@ -5558,8 +5474,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 @@ -5611,8 +5526,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 @@ -5679,8 +5593,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 @@ -5965,8 +5878,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 -; NOLIT-SRCC-NEXT: s_nop 7 -; NOLIT-SRCC-NEXT: s_nop 7 +; NOLIT-SRCC-NEXT: s_nop 15 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -6061,8 +5973,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3 -; LIT-SRCC-NEXT: s_nop 7 -; LIT-SRCC-NEXT: s_nop 7 +; LIT-SRCC-NEXT: s_nop 15 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 @@ -6125,8 +6036,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 @@ -6156,8 +6066,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 @@ -6187,8 +6096,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg ; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0) ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-VGPR-NEXT: s_nop 7 -; GFX942-VGPR-NEXT: s_nop 7 +; GFX942-VGPR-NEXT: s_nop 15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 17ae6dd23b199..aae14c8cc87b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -23,8 +23,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -47,8 +46,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -71,8 +69,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -95,8 +92,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -119,8 +115,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -143,8 +138,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -167,8 +161,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -191,8 +184,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -216,8 +208,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -241,8 +232,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -266,8 +256,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -291,8 +280,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -316,8 +304,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -341,8 +328,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -366,8 +352,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -391,8 +376,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -416,8 +400,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -441,8 +424,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -466,8 +448,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -491,8 +472,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -517,8 +497,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -542,8 +521,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -566,8 +544,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -591,8 +568,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -616,8 +592,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -641,8 +616,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -666,8 +640,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -691,8 +664,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -716,8 +688,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -741,8 +712,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -766,8 +736,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -888,8 +857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -913,8 +881,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -938,8 +905,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -963,8 +929,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1180,8 +1145,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1205,8 +1169,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1230,8 +1193,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1255,8 +1217,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a3, v15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1429,8 +1390,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: v_mov_b32_e32 v17, s1 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1451,8 +1411,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1473,8 +1432,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: v_mov_b32_e32 v16, s0 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1512,8 +1470,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_accvgpr_write_b32 a3, v1 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1543,8 +1500,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_write_b32 a3, v1 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1573,8 +1529,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1599,8 +1554,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1629,8 +1583,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1655,8 +1608,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1685,8 +1637,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1711,8 +1662,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1733,8 +1683,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_mov_b32_e32 v17, s16 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1763,8 +1712,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v9, s24 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1789,8 +1737,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_mov_b32_e32 v9, s24 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1812,8 +1759,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1831,8 +1777,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__ ; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1854,8 +1799,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1873,8 +1817,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_mov_b32_e32 v17, -2 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1896,8 +1839,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1915,8 +1857,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1958,8 +1899,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-NEXT: v_mov_b32_e32 v22, s13 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; @@ -1983,8 +1923,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) @@ -2022,8 +1961,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -2048,8 +1986,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) @@ -2087,8 +2024,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -2113,8 +2049,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) @@ -2152,8 +2087,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -2178,8 +2112,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) @@ -2217,8 +2150,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -2243,8 +2175,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; GISEL-NEXT: v_mov_b32_e32 v4, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) @@ -2263,8 +2194,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2285,8 +2215,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2308,8 +2237,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2327,8 +2255,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 ; GISEL-NEXT: v_mov_b32_e32 v17, 1 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2350,8 +2277,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a( ; SDAG-NEXT: v_accvgpr_write_b32 a3, v19 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2369,8 +2295,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a( ; GISEL-NEXT: v_mov_b32_e32 v17, 0 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2394,8 +2319,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2418,8 +2342,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2488,8 +2411,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2512,8 +2434,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8( ; GCN-NEXT: v_accvgpr_write_b32 a3, v19 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2536,8 +2457,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2560,8 +2480,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: v_accvgpr_write_b32 a3, v17 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index 839f0324227ca..f0205a3a788ed 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -38,8 +38,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -83,8 +82,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -135,8 +133,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -180,8 +177,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -232,8 +228,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -277,8 +272,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -329,8 +323,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -374,8 +367,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -426,8 +418,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -471,8 +462,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -523,8 +513,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -568,8 +557,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -620,8 +608,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -665,8 +652,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -717,8 +703,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -762,8 +747,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -813,8 +797,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -866,8 +849,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -911,8 +893,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -961,8 +942,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1013,8 +993,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1062,8 +1041,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1114,8 +1092,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1163,8 +1140,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1213,8 +1189,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1262,8 +1237,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1315,8 +1289,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1360,8 +1333,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1410,8 +1382,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1463,8 +1434,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1508,8 +1478,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1559,8 +1528,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1611,8 +1579,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1660,8 +1627,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1712,8 +1678,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1761,8 +1726,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1811,8 +1775,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1860,8 +1823,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1912,8 +1874,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -1961,8 +1922,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2013,8 +1973,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2062,8 +2021,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2112,8 +2070,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2160,8 +2117,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2209,8 +2165,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2257,8 +2212,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2309,8 +2263,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2358,8 +2311,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2410,8 +2362,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2459,8 +2410,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v29 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2509,8 +2459,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2557,8 +2506,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2606,8 +2554,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2654,8 +2601,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2703,8 +2649,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2751,8 +2696,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2800,8 +2744,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2848,8 +2791,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2897,8 +2839,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2946,8 +2887,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -2996,8 +2936,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3045,8 +2984,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v27 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3095,8 +3033,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3143,8 +3080,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3192,8 +3128,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3240,8 +3175,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v25 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3289,8 +3223,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3337,8 +3270,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons ; GCN-NEXT: v_accvgpr_write_b32 a15, v23 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -3392,8 +3324,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3441,8 +3372,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3490,8 +3420,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3554,8 +3483,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3610,8 +3538,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr ; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3665,8 +3592,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3716,8 +3642,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp ; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3771,8 +3696,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3822,8 +3746,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp ; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3877,8 +3800,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; SDAG-NEXT: v_mov_b32_e32 v8, s20 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3928,8 +3850,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp ; GISEL-NEXT: v_mov_b32_e32 v8, s20 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -3975,8 +3896,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp ; GCN-NEXT: v_mov_b32_e32 v17, s28 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4039,8 +3959,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; SDAG-NEXT: v_accvgpr_write_b32 a15, v13 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4099,8 +4018,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp ; GISEL-NEXT: v_accvgpr_write_b32 a15, v13 ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4148,8 +4066,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4193,8 +4110,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__ ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4242,8 +4158,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4287,8 +4202,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4336,8 +4250,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4381,8 +4294,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4430,8 +4342,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4475,8 +4386,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4524,8 +4434,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4569,8 +4478,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_ ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4618,8 +4526,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4663,8 +4570,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -4723,8 +4629,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 @@ -4759,8 +4664,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 @@ -4808,8 +4712,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -4844,8 +4747,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 @@ -5191,8 +5093,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 14 ; SDAG-NEXT: v_mov_b32_e32 v16, s20 ; SDAG-NEXT: v_mov_b32_e32 v17, s21 ; SDAG-NEXT: v_mov_b32_e32 v18, s22 @@ -5440,8 +5341,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5487,8 +5387,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5536,8 +5435,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5581,8 +5479,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5630,8 +5527,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5675,8 +5571,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5728,8 +5623,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5773,8 +5667,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5825,8 +5718,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5870,8 +5762,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -5922,8 +5813,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -5966,8 +5856,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6015,8 +5904,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6066,8 +5954,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6111,8 +5998,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6163,8 +6049,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 15 ; SDAG-NEXT: s_nop 3 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6208,8 +6093,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: s_nop 15 ; GISEL-NEXT: s_nop 3 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6259,8 +6143,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6310,8 +6193,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8( ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 @@ -6362,8 +6244,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6406,8 +6287,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4( ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 11 ; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 ; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 ; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 @@ -6455,8 +6335,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index 198cac5834d1f..5475fa2ae5c6e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -133,8 +133,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 @@ -172,8 +171,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-SDAG-NEXT: s_nop 1 ; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-SDAG-NEXT: s_nop 7 -; GFX942-SDAG-NEXT: s_nop 1 +; GFX942-SDAG-NEXT: s_nop 9 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 @@ -239,8 +236,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32_vgprcd(ptr addrspace(1) %ar ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GFX942-GISEL-NEXT: s_nop 7 -; GFX942-GISEL-NEXT: s_nop 1 +; GFX942-GISEL-NEXT: s_nop 9 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll index af26e7adae713..bc72687e260e7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll @@ -28,8 +28,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112 ; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96 ; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80 @@ -51,8 +50,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304 @@ -75,8 +73,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496 @@ -99,8 +96,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688 @@ -123,8 +119,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880 @@ -159,8 +154,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96 @@ -184,8 +178,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304 @@ -208,8 +201,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496 @@ -233,8 +225,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688 @@ -257,8 +248,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864 ; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880 @@ -293,8 +283,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96 @@ -315,8 +304,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304 @@ -336,8 +324,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688 ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 ; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400 ; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168 @@ -358,8 +345,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592 ; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360 @@ -383,8 +369,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 ; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880 @@ -488,8 +473,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96 @@ -513,8 +497,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304 @@ -539,8 +522,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3 ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 1 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480 @@ -563,8 +545,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672 @@ -587,8 +568,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MINREG-NEXT: s_nop 7 -; GCN-MINREG-NEXT: s_nop 7 +; GCN-MINREG-NEXT: s_nop 15 ; GCN-MINREG-NEXT: s_nop 2 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880 ; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:32864 @@ -623,8 +603,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3 ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96 @@ -648,8 +627,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304 @@ -673,8 +651,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480 @@ -698,8 +675,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 1 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672 @@ -722,8 +698,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-MAXOCC-NEXT: s_nop 7 -; GCN-MAXOCC-NEXT: s_nop 7 +; GCN-MAXOCC-NEXT: s_nop 15 ; GCN-MAXOCC-NEXT: s_nop 2 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880 ; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864 @@ -758,8 +733,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2 ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] ; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192 @@ -783,8 +757,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288 ; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304 @@ -808,8 +781,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 ; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496 ; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480 @@ -830,8 +802,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0) ; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3 -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 1 ; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576 ; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344 @@ -855,8 +826,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) -; GCN-ILP-NEXT: s_nop 7 -; GCN-ILP-NEXT: s_nop 7 +; GCN-ILP-NEXT: s_nop 15 ; GCN-ILP-NEXT: s_nop 2 ; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880 ; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 5b877f5a2bbb7..aa099b60ef16d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -678,8 +678,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 4 +; GCN-NEXT: s_nop 12 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -785,8 +784,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95] ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 4 +; EXACTCUTOFF-NEXT: s_nop 12 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[156:159] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[152:155] offset:96 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[148:151] offset:80 @@ -890,8 +888,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: v_add_u32_e32 v0, s1, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:96 @@ -915,8 +912,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:8304 @@ -939,8 +935,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:16480 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:16496 @@ -964,8 +959,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:24672 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:24688 @@ -988,8 +982,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 15 ; GCN-NEXT: s_nop 2 ; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 @@ -1024,8 +1017,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 1 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:112 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:96 @@ -1049,8 +1041,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 1 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:8288 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:8304 @@ -1073,8 +1064,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 2 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:16480 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:16496 @@ -1098,8 +1088,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 1 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:24672 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:24688 @@ -1122,8 +1111,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) -; EXACTCUTOFF-NEXT: s_nop 7 -; EXACTCUTOFF-NEXT: s_nop 7 +; EXACTCUTOFF-NEXT: s_nop 15 ; EXACTCUTOFF-NEXT: s_nop 2 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[24:27] offset:32864 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[28:31] offset:32880 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index b25fe8392a60e..6eb9449069a52 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -199,8 +199,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -232,8 +231,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 @@ -253,8 +251,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -316,8 +313,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -379,8 +375,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -471,8 +466,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -685,8 +679,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 10 ; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -706,8 +699,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_mov_b32_e32 v0, v12 ; GCN-NEXT: v_mov_b32_e32 v1, v13 ; GCN-NEXT: v_mov_b32_e32 v2, v14 @@ -734,8 +726,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, < ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_mov_b32_e32 v0, v12 ; GCN-NEXT: v_mov_b32_e32 v1, v13 ; GCN-NEXT: v_mov_b32_e32 v2, v14 @@ -762,8 +753,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, < ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_mov_b32_e32 v0, v12 ; GCN-NEXT: v_mov_b32_e32 v1, v13 ; GCN-NEXT: v_mov_b32_e32 v2, v14 @@ -819,8 +809,7 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg ; GCN-NEXT: v_mov_b32_e32 v27, v9 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10 -; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 11 ; GCN-NEXT: v_mov_b32_e32 v0, v12 ; GCN-NEXT: v_mov_b32_e32 v1, v13 ; GCN-NEXT: v_mov_b32_e32 v2, v14 @@ -1049,8 +1038,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -1082,8 +1070,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -1103,8 +1090,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -1166,8 +1152,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -1229,8 +1214,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -1321,8 +1305,7 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2098,8 +2081,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -2131,8 +2113,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2152,8 +2133,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2215,8 +2195,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2278,8 +2257,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2370,8 +2348,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2471,8 +2448,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -2504,8 +2480,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2525,8 +2500,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2588,8 +2562,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2651,8 +2624,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2743,8 +2715,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2844,8 +2815,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -2877,8 +2847,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -2898,8 +2867,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -2961,8 +2929,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3024,8 +2991,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3116,8 +3082,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3217,8 +3182,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: s_nop 10 ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] @@ -3250,8 +3214,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 2 +; GISEL-NEXT: s_nop 10 ; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 @@ -3271,8 +3234,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3334,8 +3296,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3397,8 +3358,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, < ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 @@ -3489,8 +3449,7 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ; SDAG-NEXT: v_mov_b32_e32 v27, v9 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 11 ; SDAG-NEXT: v_mov_b32_e32 v0, v12 ; SDAG-NEXT: v_mov_b32_e32 v1, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v14 diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir index 3feccff715bc1..ddd8a4784ea86 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir @@ -125,8 +125,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_agpr_mfma_read_overlap body: | @@ -136,8 +135,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap body: | @@ -147,8 +145,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap body: | @@ -196,8 +193,7 @@ body: | ... # GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: V_MFMA name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap body: | @@ -207,8 +203,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap @@ -249,8 +244,7 @@ body: | ... # GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -260,8 +254,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap @@ -312,8 +305,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap body: | @@ -333,8 +325,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap body: | @@ -384,8 +375,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap body: | @@ -435,8 +425,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: FLAT_STORE_DWORD name: smfma16x16_write_vgpr_flat_read body: | @@ -446,8 +435,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: FLAT_STORE_DWORD name: smfma32x32_write_vgpr_flat_read @@ -458,8 +446,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_vgpr_flat_read_overlap body: | @@ -469,8 +456,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_vgpr_flat_read_full body: | @@ -480,8 +466,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma16x16_write_vgpr_flat_read @@ -502,8 +487,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: smfma16x16_write_vgpr_valu_read body: | @@ -513,8 +497,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_MOV_B32 name: smfma32x32_write_vgpr_valu_read @@ -535,8 +518,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: dmfma16x16_write_vgpr_valu_read body: | @@ -556,8 +538,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_accv_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: smfma16x16_write_vgpr_accv_read body: | @@ -567,8 +548,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_accv_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: smfma32x32_write_vgpr_accv_read @@ -599,8 +579,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_DOT name: dmfma16x16_write_vgpr_dot_read body: | @@ -620,8 +599,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: smfma16x16_write_vgpr_valu_write body: | @@ -631,8 +609,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_MOV_B32 name: smfma32x32_write_vgpr_valu_write @@ -653,8 +630,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_valu_f16_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_FMA_F16_e64 name: smfma16x16_write_vgpr_valu_f16_write body: | @@ -664,8 +640,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_valu_f16_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_FMA_F16_e64 name: smfma32x32_write_vgpr_valu_f16_write @@ -686,8 +661,7 @@ body: | ... # GCN-LABEL: name: smfma16x16_write_vgpr_valu_sdwa_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32_sdwa name: smfma16x16_write_vgpr_valu_sdwa_write body: | @@ -697,8 +671,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_write_vgpr_valu_sdwa_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 2 # GCN-NEXT: V_MOV_B32_sdwa name: smfma32x32_write_vgpr_valu_sdwa_write @@ -719,8 +692,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: dmfma16x16_write_vgpr_valu_write body: | @@ -770,8 +742,7 @@ body: | ... # GCN-LABEL: name: smfma32x32_read_srcc_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 6 +# GCN-NEXT: S_NOP 14 # GCN-NEXT: V_MOV_B32 name: smfma32x32_read_srcc_vgpr_valu_write body: | @@ -1040,8 +1011,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_read_overlap body: | @@ -1080,8 +1050,7 @@ body: | ... # GCN-LABEL: name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: V_MFMA name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap body: | @@ -1091,8 +1060,7 @@ body: | ... # GCN-LABEL: name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap @@ -1133,8 +1101,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -1154,8 +1121,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap body: | @@ -1185,8 +1151,7 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap body: | @@ -1196,8 +1161,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_agpr_flat_read_overlap body: | @@ -1207,8 +1171,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_agpr_flat_read_full body: | @@ -1218,8 +1181,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma16x16_write_agpr_flat_read @@ -1240,8 +1202,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_ACCVGPR_READ_B32_e64 name: dmfma16x16_write_agpr_valu_read body: | @@ -1261,8 +1222,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: dmfma16x16_write_agpr_valu_write body: | diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir index 8f4f57a5d37c5..1ef6b4c844c93 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx942.mir @@ -178,11 +178,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_overlap body: | @@ -192,11 +189,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_overlap body: | @@ -225,11 +219,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_SMFMAC name: xdl_sgemm16x16_mfma_write_agpr_smfmac_read_overlap body: | @@ -239,8 +230,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -252,8 +242,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -274,8 +263,7 @@ body: | ... # GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: V_MFMA name: nonxdl_sgemm32x32_mfma_write_agpr_nonxdl_mfma_read_overlap body: | @@ -285,8 +273,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_smfmac_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_SMFMAC @@ -298,11 +285,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap @@ -323,11 +307,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap @@ -358,9 +339,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap body: | @@ -370,8 +350,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -383,9 +362,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_agpr_mfma_read_partial body: | @@ -395,9 +373,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_vgpr_mfma_read_partial body: | @@ -417,9 +394,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -429,8 +405,7 @@ body: | ... # GCN-LABEL: name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: nonxdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -440,9 +415,8 @@ body: | ... # GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: smfmac32x32_write_agpr_mfma_srca_read_overlap body: | @@ -452,9 +426,8 @@ body: | ... # GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_SMFMAC name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap body: | @@ -464,8 +437,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -477,8 +449,7 @@ body: | ... # GCN-LABEL: name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA name: nonxdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap @@ -539,11 +510,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap @@ -564,11 +532,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap @@ -639,11 +604,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap @@ -654,11 +616,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_SMFMAC name: dgemm16x16_mfma_write_vgpr_smfmac_srcb_read_overlap @@ -669,11 +628,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_smfmac_srcc_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_SMFMAC @@ -746,9 +702,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: FLAT_STORE_DWORD name: xdl_smfma16x16_write_vgpr_flat_read body: | @@ -758,9 +713,8 @@ body: | ... # GCN-LABEL: name: smfmac32x32_write_vgpr_flat_read # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: FLAT_STORE_DWORD name: smfmac32x32_write_vgpr_flat_read body: | @@ -770,8 +724,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: FLAT_STORE_DWORD @@ -783,8 +736,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_vgpr_flat_read_overlap body: | @@ -794,8 +746,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_vgpr_flat_read_full body: | @@ -805,8 +756,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma16x16_write_vgpr_flat_read @@ -827,9 +777,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MOV_B32 name: xdl_smfma16x16_write_vgpr_valu_read body: | @@ -839,8 +788,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 @@ -862,11 +810,8 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MOV_B32 name: dmfma16x16_write_vgpr_valu_read @@ -887,9 +832,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_accv_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: xdl_smfma16x16_write_vgpr_accv_read body: | @@ -899,8 +843,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_accv_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 @@ -932,11 +875,8 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_DOT @@ -958,9 +898,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MOV_B32 name: xdl_smfma16x16_write_vgpr_valu_write body: | @@ -970,8 +909,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32 @@ -993,9 +931,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_f16_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_FMA_F16_e64 name: xdl_smfma16x16_write_vgpr_valu_f16_write body: | @@ -1005,8 +942,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_f16_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_FMA_F16_e64 @@ -1028,9 +964,8 @@ body: | ... # GCN-LABEL: name: xdl_smfma16x16_write_vgpr_valu_sdwa_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MOV_B32_sdwa name: xdl_smfma16x16_write_vgpr_valu_sdwa_write body: | @@ -1040,8 +975,7 @@ body: | ... # GCN-LABEL: name: xdl_smfma32x32_write_vgpr_valu_sdwa_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MOV_B32_sdwa @@ -1063,8 +997,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_MOV_B32 name: dmfma16x16_write_vgpr_valu_write body: | @@ -1379,11 +1312,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_read_overlap @@ -1404,11 +1334,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 0 # GCN-NEXT: V_MFMA @@ -1430,9 +1357,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap body: | @@ -1442,8 +1368,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -1485,11 +1410,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap @@ -1510,11 +1432,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA @@ -1546,11 +1465,8 @@ body: | ... # GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_MFMA name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap @@ -1561,8 +1477,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_agpr_flat_read_overlap body: | @@ -1572,8 +1487,7 @@ body: | ... # GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_NOP 8 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma4x4_write_agpr_flat_read_full body: | @@ -1583,8 +1497,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_flat_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: FLAT_STORE_DWORD name: dmfma16x16_write_agpr_flat_read @@ -1605,11 +1518,8 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_valu_read # GCN: V_MFMA -# GFX942-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 - -# GFX950-NEXT: S_NOP 7 -# GFX950-NEXT: S_NOP 7 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 15 # GFX950-NEXT: S_NOP 2 # GCN-NEXT: V_ACCVGPR_READ_B32_e64 name: dmfma16x16_write_agpr_valu_read @@ -1630,8 +1540,7 @@ body: | ... # GCN-LABEL: name: dmfma16x16_write_agpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GCN-NEXT: S_NOP 10 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: dmfma16x16_write_agpr_valu_write body: | @@ -1840,9 +1749,8 @@ body: | ... # GCN-LABEL: name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_SMFMAC name: smfmac32x32x32_mfma_write_agpr_mfma_read_overlap body: | @@ -1959,8 +1867,7 @@ body: | ... # GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: BUFFER_STORE_DWORD name: nonxdl_8pass_smfma16x16_write_vgpr_vm_read body: | @@ -1970,8 +1877,7 @@ body: | ... # GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MOV_B32 name: nonxdl_8pass_smfma16x16_write_vgpr_valu_read body: | @@ -1981,8 +1887,7 @@ body: | ... # GCN-LABEL: name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MOV_B32 name: nonxdl_8pass_smfma16x16_write_vgpr_valu_write body: | @@ -1992,8 +1897,7 @@ body: | ... # GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_vm_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: BUFFER_STORE_DWORD name: nonxdl_smfma32x32_write_vgpr_vm_read @@ -2004,8 +1908,7 @@ body: | ... # GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_read # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: V_MOV_B32 name: nonxdl_smfma32x32_write_vgpr_valu_read @@ -2016,8 +1919,7 @@ body: | ... # GCN-LABEL: name: nonxdl_smfma32x32_write_vgpr_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: V_MOV_B32 name: nonxdl_smfma32x32_write_vgpr_valu_write @@ -2109,9 +2011,8 @@ body: | ... # GCN-LABEL: name: smfmac32x32_read_vgpr_srcc_valu_write # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MOV_B32 name: smfmac32x32_read_vgpr_srcc_valu_write body: | @@ -2121,8 +2022,7 @@ body: | ... # GCN-LABEL: name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 6 +# GCN-NEXT: S_NOP 14 # GCN-NEXT: V_MOV_B32 name: xdl_sgemm32x32_mfma_read_vgpr_srcc_valu_write body: | @@ -2337,9 +2237,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc body: | @@ -2353,9 +2252,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca body: | @@ -2369,9 +2267,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb body: | @@ -2385,8 +2282,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -2403,8 +2299,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -2420,8 +2315,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -2450,8 +2344,7 @@ body: | # 8 pass source # GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca body: | @@ -2464,8 +2357,7 @@ body: | # 8 pass source # GCN-LABEL: name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: nonxdl_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb body: | @@ -2477,9 +2369,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcc body: | @@ -2492,9 +2383,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca body: | @@ -2507,9 +2397,8 @@ body: | # 8 pass source # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 2 -# GFX950-NEXT: S_NOP 3 +# GFX942-NEXT: S_NOP 10 +# GFX950-NEXT: S_NOP 11 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb body: | @@ -2522,8 +2411,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_MFMA @@ -2539,8 +2427,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -2557,8 +2444,7 @@ body: | # 16 pass source # GCN-LABEL: name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 2 # GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA @@ -2603,9 +2489,8 @@ body: | ... # GCN-LABEL: name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GFX942-NEXT: S_NOP 0 -# GFX950-NEXT: S_NOP 1 +# GFX942-NEXT: S_NOP 8 +# GFX950-NEXT: S_NOP 9 # GCN-NEXT: V_SMFMAC_ name: xdl_8pass_mfma_write_agpr_smfmac_read_overlap_srcc body: | @@ -2617,8 +2502,7 @@ body: | ... # GCN-LABEL: name: xdl_16pass_mfma_write_agpr_smfmac_read_overlap_srcc # GCN: V_MFMA -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GFX942-NEXT: S_NOP 0 # GFX950-NEXT: S_NOP 1 # GCN-NEXT: V_SMFMAC_ diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index c01c2be23b83f..7708c8fc00609 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -15,8 +15,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $mode, implicit $exec @@ -37,8 +36,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 1, 1, implicit $mode, implicit $exec @@ -59,8 +57,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 2, 0, implicit $mode, implicit $exec @@ -81,8 +78,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 2, implicit $mode, implicit $exec @@ -163,8 +159,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 15 ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 @@ -186,8 +181,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, killed $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec @@ -208,8 +202,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 7 + ; GCN-NEXT: S_NOP 15 ; GCN-NEXT: S_NOP 1 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 @@ -231,8 +224,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 1 + ; GCN-NEXT: S_NOP 9 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_vgprcd_e64 $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 2, 2, $vgpr33, $vgpr32, 12, 4, implicit $mode, implicit $exec @@ -253,8 +245,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 3 + ; GCN-NEXT: S_NOP 11 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $vgpr33, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $vgpr33, $vgpr21, 12, 4, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index ce67a2eec93bc..61f2629dded83 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -157,8 +157,7 @@ body: | # GCN-LABEL: name: mfma_16x16_write_agpr_accvgpr_read # GCN: V_MFMA_F32_16X16X1F32 -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 1 +# GCN-NEXT: S_NOP 9 # GCN-NEXT: V_ACCVGPR_READ_B32_e64 name: mfma_16x16_write_agpr_accvgpr_read body: | @@ -170,8 +169,7 @@ body: | # GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_read # GCN: V_MFMA_F32_32X32X2F32 -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 15 # GCN-NEXT: S_NOP 1 # GCN-NEXT: V_ACCVGPR_READ_B32_e64 name: mfma_32x32_write_agpr_accvgpr_read @@ -208,8 +206,7 @@ body: | # GCN-LABEL: name: mfma_32x32_write_agpr_accvgpr_write # GCN: V_MFMA_F32_32X32X2F32 -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 6 +# GCN-NEXT: S_NOP 14 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: mfma_32x32_write_agpr_accvgpr_write body: | @@ -244,8 +241,7 @@ body: | # GCN-LABEL: name: mfma_32x32_read_srcc_accvgpr_write # GCN: V_MFMA_F32_32X32X2F32 -# GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 4 +# GCN-NEXT: S_NOP 12 # GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 name: mfma_32x32_read_srcc_accvgpr_write body: | diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index f7aaa3ec4d0ed..9585c486aeb9e 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -84,8 +84,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -227,8 +226,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -347,8 +345,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -454,8 +451,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr add ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -561,8 +557,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addr ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -690,8 +685,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -835,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace( ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a26 @@ -977,8 +970,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 @@ -1079,8 +1071,7 @@ define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) ; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index d39daaade677f..3b8efafba06f4 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -54,8 +54,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -148,8 +147,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -208,8 +206,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -288,8 +285,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -383,8 +379,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -444,8 +439,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -518,8 +512,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -612,8 +605,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -672,8 +664,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -783,8 +774,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -909,8 +899,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1001,8 +990,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1075,8 +1063,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -1170,8 +1157,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1231,8 +1217,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1344,8 +1329,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -1441,8 +1425,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1504,8 +1487,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1614,8 +1596,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX908-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -1712,8 +1693,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1776,8 +1756,7 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1856,8 +1835,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX908-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -1919,8 +1897,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1948,8 +1925,7 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2019,8 +1995,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 15 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX908-NEXT: s_nop 1 @@ -2065,8 +2040,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX908-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -2118,8 +2092,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 @@ -2163,8 +2136,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2182,8 +2154,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 @@ -2227,8 +2198,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 3 +; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2349,8 +2319,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX908-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX908-NEXT: ; %bb.4: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 2 +; GFX908-NEXT: s_nop 10 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 @@ -2453,8 +2422,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: s_nop 9 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2523,8 +2491,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: s_nop 8 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll index e6d7b14381d7a..51cd564bdece3 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll @@ -93,8 +93,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31] -; GREEDY908-NEXT: s_nop 7 -; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 15 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32 ; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61 @@ -158,8 +157,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] -; GREEDY908-NEXT: s_nop 7 -; GREEDY908-NEXT: s_nop 7 +; GREEDY908-NEXT: s_nop 15 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26 @@ -263,8 +261,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] -; GREEDY90A-NEXT: s_nop 7 -; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 15 ; GREEDY90A-NEXT: s_nop 2 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33 @@ -298,8 +295,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] -; GREEDY90A-NEXT: s_nop 7 -; GREEDY90A-NEXT: s_nop 7 +; GREEDY90A-NEXT: s_nop 15 ; GREEDY90A-NEXT: s_nop 2 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 @@ -356,8 +352,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[32:63], v0, v1, a[0:31] -; GREEDY942-NEXT: s_nop 7 -; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 15 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33 @@ -391,8 +386,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] -; GREEDY942-NEXT: s_nop 7 -; GREEDY942-NEXT: s_nop 7 +; GREEDY942-NEXT: s_nop 15 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 @@ -448,8 +442,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] -; GREEDY90A-GISEL-NEXT: s_nop 7 -; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 15 ; GREEDY90A-GISEL-NEXT: s_nop 2 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a33 @@ -484,8 +477,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GREEDY90A-GISEL-NEXT: s_nop 7 -; GREEDY90A-GISEL-NEXT: s_nop 7 +; GREEDY90A-GISEL-NEXT: s_nop 15 ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 @@ -542,8 +534,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63] -; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 15 ; FAST90A-NEXT: s_nop 2 ; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29 ; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28 @@ -609,8 +600,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] -; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 7 +; FAST90A-NEXT: s_nop 15 ; FAST90A-NEXT: s_nop 2 ; FAST90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; FAST90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 @@ -676,8 +666,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] -; GREEDY908-NEXT: s_nop 7 -; GREEDY908-NEXT: s_nop 0 +; GREEDY908-NEXT: s_nop 8 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18 ; GREEDY908-NEXT: s_nop 0 @@ -685,8 +674,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GREEDY908-NEXT: s_nop 7 -; GREEDY908-NEXT: s_nop 1 +; GREEDY908-NEXT: s_nop 9 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 @@ -744,14 +732,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] -; GREEDY90A-NEXT: s_nop 7 -; GREEDY90A-NEXT: s_nop 1 +; GREEDY90A-NEXT: s_nop 9 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] -; GREEDY90A-NEXT: s_nop 7 -; GREEDY90A-NEXT: s_nop 2 +; GREEDY90A-NEXT: s_nop 10 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 @@ -786,14 +772,12 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] -; GREEDY942-NEXT: s_nop 7 -; GREEDY942-NEXT: s_nop 0 +; GREEDY942-NEXT: s_nop 8 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] -; GREEDY942-NEXT: s_nop 7 -; GREEDY942-NEXT: s_nop 1 +; GREEDY942-NEXT: s_nop 9 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 @@ -827,8 +811,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] -; GREEDY90A-GISEL-NEXT: s_nop 7 -; GREEDY90A-GISEL-NEXT: s_nop 2 +; GREEDY90A-GISEL-NEXT: s_nop 10 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a16 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a17 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a18 @@ -846,8 +829,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GREEDY90A-GISEL-NEXT: s_nop 7 -; GREEDY90A-GISEL-NEXT: s_nop 1 +; GREEDY90A-GISEL-NEXT: s_nop 9 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 @@ -882,8 +864,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] ; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15] -; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 2 +; FAST90A-NEXT: s_nop 10 ; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 ; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 ; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18 @@ -900,8 +881,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] -; FAST90A-NEXT: s_nop 7 -; FAST90A-NEXT: s_nop 2 +; FAST90A-NEXT: s_nop 10 ; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir index df3dd7292b7f8..4d1a663aace42 100644 --- a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir +++ b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir @@ -372,14 +372,12 @@ body: | ; ; gfx908-PAD75-LABEL: name: mfma_padding_16_pass ; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; gfx908-PAD75-NEXT: S_NOP 7 - ; gfx908-PAD75-NEXT: S_NOP 3 + ; gfx908-PAD75-NEXT: S_NOP 11 ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; gfx908-PAD100-NEXT: S_NOP 7 - ; gfx908-PAD100-NEXT: S_NOP 7 + ; gfx908-PAD100-NEXT: S_NOP 15 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass @@ -393,8 +391,7 @@ body: | ; ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; gfx90a-PAD100-NEXT: S_NOP 7 - ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: S_NOP 15 ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass @@ -408,8 +405,7 @@ body: | ; ; gfx942-PAD100-LABEL: name: mfma_padding_16_pass ; gfx942-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec - ; gfx942-PAD100-NEXT: S_NOP 7 - ; gfx942-PAD100-NEXT: S_NOP 7 + ; gfx942-PAD100-NEXT: S_NOP 15 ; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec @@ -459,8 +455,7 @@ body: | ; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; gfx908-PAD100-NEXT: S_NOP 7 - ; gfx908-PAD100-NEXT: S_NOP 3 + ; gfx908-PAD100-NEXT: S_NOP 11 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu @@ -486,8 +481,7 @@ body: | ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec ; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; gfx90a-PAD100-NEXT: S_NOP 7 - ; gfx90a-PAD100-NEXT: S_NOP 3 + ; gfx90a-PAD100-NEXT: S_NOP 11 ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu @@ -513,8 +507,7 @@ body: | ; gfx942-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx942-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec ; gfx942-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; gfx942-PAD100-NEXT: S_NOP 7 - ; gfx942-PAD100-NEXT: S_NOP 3 + ; gfx942-PAD100-NEXT: S_NOP 11 ; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -887,8 +880,7 @@ body: | ; gfx908-PAD75-NEXT: {{ $}} ; gfx908-PAD75-NEXT: bb.2: ; gfx908-PAD75-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; gfx908-PAD75-NEXT: S_NOP 7 - ; gfx908-PAD75-NEXT: S_NOP 1 + ; gfx908-PAD75-NEXT: S_NOP 9 ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_2_preds @@ -905,8 +897,7 @@ body: | ; gfx908-PAD100-NEXT: {{ $}} ; gfx908-PAD100-NEXT: bb.2: ; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; gfx908-PAD100-NEXT: S_NOP 7 - ; gfx908-PAD100-NEXT: S_NOP 5 + ; gfx908-PAD100-NEXT: S_NOP 13 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds @@ -956,8 +947,7 @@ body: | ; gfx90a-PAD100-NEXT: {{ $}} ; gfx90a-PAD100-NEXT: bb.2: ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; gfx90a-PAD100-NEXT: S_NOP 7 - ; gfx90a-PAD100-NEXT: S_NOP 5 + ; gfx90a-PAD100-NEXT: S_NOP 13 ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; ; gfx942-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds @@ -1007,8 +997,7 @@ body: | ; gfx942-PAD100-NEXT: {{ $}} ; gfx942-PAD100-NEXT: bb.2: ; gfx942-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec - ; gfx942-PAD100-NEXT: S_NOP 7 - ; gfx942-PAD100-NEXT: S_NOP 5 + ; gfx942-PAD100-NEXT: S_NOP 13 ; gfx942-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec bb.0: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll index cefcd7e0d2651..fc154604b8700 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll @@ -33,8 +33,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi(ptr addrspace(1) % ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: ; implicit-def: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; CHECK-NEXT: .LBB0_3: ; %if -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: global_load_dwordx4 a[28:31], v32, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 a[24:27], v32, s[0:1] offset:96 ; CHECK-NEXT: global_load_dwordx4 a[20:23], v32, s[0:1] offset:80 @@ -98,8 +97,7 @@ define amdgpu_kernel void @test_rewrite_mfma_copy_to_agpr_phi_loop(ptr addrspace ; CHECK-NEXT: .LBB1_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: v_mov_b64_e32 v[62:63], v[30:31] ; CHECK-NEXT: v_mov_b64_e32 v[60:61], v[28:29] ; CHECK-NEXT: v_mov_b64_e32 v[58:59], v[26:27] diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index 5f42abbeae253..b9e9893ede4e2 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -60,8 +60,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], a0, a1, v[0:31] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: v_mov_b32_e32 v2, v32 ; CHECK-NEXT: v_mov_b32_e32 v3, v33 @@ -96,8 +95,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma(ptr addrsp ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], a0, a1, v[0:31] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 @@ -143,8 +141,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_noshuffle( ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mov_b32_e32 v32, 0 -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 @@ -178,8 +175,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm0_src2( ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 @@ -212,8 +208,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_rewrite_vgpr_mfma_imm1_src2( ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 @@ -351,8 +346,7 @@ define void @test_rewrite_mfma_subreg_extract2(float %arg0, float %arg1, ptr add ; CHECK-NEXT: global_load_dwordx4 a[0:3], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 1 ; CHECK-NEXT: v_accvgpr_mov_b32 a0, a1 ; CHECK-NEXT: v_accvgpr_mov_b32 a1, a2 @@ -717,8 +711,7 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class_chain(p ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 7 +; CHECK-NEXT: s_nop 15 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; CHECK-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 @@ -777,8 +770,7 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3] ; CHECK-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1] -; CHECK-NEXT: s_nop 7 -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_nop 8 ; CHECK-NEXT: global_store_dwordx2 v[2:3], a[0:1], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 2040e2b26cb15..290d9c5401154 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -258,8 +258,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v4, v4, a[0:31] ; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: s_nop 7 -; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: s_nop 13 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v5 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND @@ -339,8 +338,7 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v2, a[0:31] -; GFX90A-NEXT: s_nop 7 -; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 ; GFX90A-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index d8264b5a091e1..b045c761436de 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -958,8 +958,7 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3 -; GFX942-NEXT: s_nop 7 -; GFX942-NEXT: s_nop 7 +; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96