32 changes: 0 additions & 32 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr(ptr addrspace(1) inreg %ptr) {
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
store i32 0, ptr addrspace(1) %ptr
ret void
Expand Down Expand Up @@ -65,8 +63,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: global_store_b32 v0, v0, s[2:3] offset:16380
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -105,8 +101,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967296(ptr addrspace(1) in
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -145,8 +139,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4294967297(ptr addrspace(1) in
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967297
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -180,8 +172,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4096(ptr addrspace(1) inreg %p
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: global_store_b32 v0, v0, s[2:3] offset:16384
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4096
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -213,8 +203,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4095(ptr addrspace(1) %ptr) {
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16380
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -248,8 +236,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -283,8 +269,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967297
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -316,8 +300,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_offset4096(ptr addrspace(1) %ptr) {
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16384
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4096
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -362,8 +344,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg %
; GFX12-NEXT: s_add_co_u32 s0, s2, s0
; GFX12-NEXT: s_add_co_ci_u32 s1, s3, s1
; GFX12-NEXT: global_store_b32 v0, v0, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %soffset
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -402,8 +382,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %soffset
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -442,8 +420,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(ptr addrspace(
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:1024
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 %soffset
%gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 256
Expand Down Expand Up @@ -483,8 +459,6 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(ptr addrspace(
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:1024
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 256
%gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 %soffset
Expand Down Expand Up @@ -528,8 +502,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg %
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
store i32 0, ptr addrspace(1) %gep
Expand Down Expand Up @@ -574,8 +546,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16380
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
%gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 4095
Expand Down Expand Up @@ -620,8 +590,6 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off offset:16380
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%gep0 = getelementptr i32, ptr addrspace(1) %ptr, i32 4095
%gep1 = getelementptr i32, ptr addrspace(1) %gep0, i32 %voffset
Expand Down
22 changes: 0 additions & 22 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v7
; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
Expand Down Expand Up @@ -93,8 +91,6 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
Expand Down Expand Up @@ -147,8 +143,6 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
Expand Down Expand Up @@ -195,8 +189,6 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
Expand Down Expand Up @@ -250,8 +242,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
Expand Down Expand Up @@ -300,8 +290,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
Expand Down Expand Up @@ -351,8 +339,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: v_mul_lo_u32 v1, v0, v2
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
Expand Down Expand Up @@ -384,8 +370,6 @@ define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addr
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
Expand Down Expand Up @@ -448,8 +432,6 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[4:5], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
Expand Down Expand Up @@ -481,8 +463,6 @@ define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, p
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -583,8 +563,6 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Expand Down
16 changes: 0 additions & 16 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2519,8 +2519,6 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_vregs:
Expand All @@ -2529,8 +2527,6 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = zext i32 %val to i64
Expand Down Expand Up @@ -2613,8 +2609,6 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_sregs:
Expand All @@ -2629,8 +2623,6 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = zext i32 %val to i64
Expand Down Expand Up @@ -2695,8 +2687,6 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4]
; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_sext_with_vregs:
Expand All @@ -2705,8 +2695,6 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = sext i32 %val to i64
Expand Down Expand Up @@ -2804,8 +2792,6 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_add_i32 s3, s4, s3
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_sext_with_sregs:
Expand All @@ -2820,8 +2806,6 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%ext = sext i32 %val to i64
Expand Down
4 changes: 0 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
Original file line number Diff line number Diff line change
Expand Up @@ -305,8 +305,6 @@ define amdgpu_kernel void @mulu24_shl64(ptr addrspace(1) nocapture %arg) {
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
; GFX11-NEXT: global_store_b32 v[2:3], v1, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -404,8 +402,6 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add
; GFX11-NEXT: v_mul_i32_i24_e32 v0, -7, v0
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
Expand All @@ -25,8 +23,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
Expand All @@ -42,8 +38,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
Expand All @@ -59,8 +53,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
Expand All @@ -76,8 +68,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
Expand All @@ -93,8 +83,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
Expand All @@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
Expand All @@ -123,8 +109,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
Expand All @@ -138,8 +122,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x half> %C
Expand All @@ -153,8 +135,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
Expand All @@ -170,8 +150,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
Expand All @@ -187,8 +165,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
Expand All @@ -204,8 +180,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
Expand All @@ -221,8 +195,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
Expand All @@ -238,8 +210,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
Expand All @@ -255,8 +225,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
Expand All @@ -272,8 +240,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <8 x float> %C
Expand All @@ -289,8 +255,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
Expand All @@ -306,8 +270,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
Expand All @@ -323,8 +285,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
Expand All @@ -338,8 +298,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
Expand All @@ -353,8 +311,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <16 x half> %B
Expand All @@ -372,8 +328,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
Expand All @@ -388,8 +342,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x ha
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
Expand All @@ -408,8 +360,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%el3 = extractelement <8 x float> %C, i32 3
Expand All @@ -430,8 +380,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <8 x half> %A
Expand All @@ -445,8 +393,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
Expand Down Expand Up @@ -477,8 +423,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%C = load <16 x half>, ptr %Caddr
Expand Down
44 changes: 0 additions & 44 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
Expand Down Expand Up @@ -38,8 +36,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -54,8 +50,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
Expand Down Expand Up @@ -84,8 +78,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -98,8 +90,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
Expand All @@ -120,8 +110,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
Expand All @@ -142,8 +130,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
Expand All @@ -164,8 +150,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
Expand All @@ -180,8 +164,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
Expand Down Expand Up @@ -210,8 +192,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
Expand All @@ -226,8 +206,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
Expand Down Expand Up @@ -256,8 +234,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
Expand All @@ -272,8 +248,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
Expand Down Expand Up @@ -302,8 +276,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -318,8 +290,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
Expand Down Expand Up @@ -348,8 +318,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -364,8 +332,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
Expand Down Expand Up @@ -394,8 +360,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -410,8 +374,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
Expand Down Expand Up @@ -440,8 +402,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i3
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -456,8 +416,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
Expand Down Expand Up @@ -486,8 +444,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
Expand Down
36 changes: 0 additions & 36 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -40,8 +36,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -58,8 +52,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
Expand All @@ -74,8 +66,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
Expand All @@ -90,8 +80,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
Expand All @@ -108,8 +96,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -124,8 +110,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -140,8 +124,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -160,8 +142,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
Expand All @@ -176,8 +156,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
Expand All @@ -192,8 +170,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
Expand All @@ -210,8 +186,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
Expand All @@ -226,8 +200,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
Expand All @@ -242,8 +214,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
Expand All @@ -260,8 +230,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
Expand All @@ -276,8 +244,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
Expand All @@ -292,8 +258,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand Down Expand Up @@ -51,8 +49,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand All @@ -77,8 +73,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand All @@ -103,8 +97,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <1
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand Down Expand Up @@ -135,8 +127,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand Down Expand Up @@ -167,8 +157,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand Down Expand Up @@ -199,8 +187,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand Down Expand Up @@ -231,8 +217,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand Down Expand Up @@ -263,8 +247,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand Down Expand Up @@ -295,8 +277,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
Expand Down
44 changes: 0 additions & 44 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
Expand All @@ -24,8 +22,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
Expand All @@ -38,8 +34,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
Expand All @@ -52,8 +46,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
Expand All @@ -68,8 +60,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -84,8 +74,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
Expand All @@ -100,8 +88,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
Expand All @@ -116,8 +102,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
Expand All @@ -132,8 +116,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
Expand All @@ -148,8 +130,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
Expand All @@ -164,8 +144,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -181,8 +159,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
Expand All @@ -197,8 +173,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
Expand All @@ -211,8 +185,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
Expand All @@ -225,8 +197,6 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
Expand All @@ -241,8 +211,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
Expand All @@ -257,8 +225,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
Expand All @@ -273,8 +239,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B,
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
Expand All @@ -289,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
Expand All @@ -305,8 +267,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
Expand All @@ -321,8 +281,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
Expand All @@ -337,8 +295,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32>
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
Expand All @@ -21,8 +19,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
Expand All @@ -36,8 +32,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
Expand All @@ -51,8 +45,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
Expand All @@ -66,8 +58,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
Expand All @@ -81,8 +71,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
Expand All @@ -96,8 +84,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
Expand All @@ -111,8 +97,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
Expand All @@ -126,8 +110,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x half> %C
Expand All @@ -141,8 +123,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
Expand All @@ -156,8 +136,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
Expand All @@ -171,8 +149,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
Expand All @@ -186,8 +162,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
Expand All @@ -201,8 +175,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
Expand All @@ -216,8 +188,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
Expand All @@ -231,8 +201,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
Expand All @@ -246,8 +214,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.C = fneg <4 x float> %C
Expand All @@ -261,8 +227,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
Expand All @@ -276,8 +240,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
Expand All @@ -291,8 +253,6 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
Expand All @@ -306,8 +266,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
Expand All @@ -321,8 +279,6 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <8 x half> %B
Expand All @@ -338,8 +294,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
Expand All @@ -354,8 +308,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x ha
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
Expand All @@ -372,8 +324,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%el3 = extractelement <4 x float> %C, i32 3
Expand All @@ -392,8 +342,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.A = fneg <4 x half> %A
Expand All @@ -407,8 +355,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%fneg.B = fneg <4 x half> %B
Expand All @@ -432,8 +378,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%C = load <8 x half>, ptr %Caddr
Expand Down
44 changes: 0 additions & 44 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
Expand All @@ -30,8 +28,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -44,8 +40,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
Expand All @@ -68,8 +62,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -82,8 +74,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
Expand All @@ -102,8 +92,6 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
Expand All @@ -122,8 +110,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
Expand All @@ -142,8 +128,6 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
Expand All @@ -156,8 +140,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrsp
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
Expand All @@ -180,8 +162,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
Expand All @@ -194,8 +174,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrsp
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
Expand All @@ -218,8 +196,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
Expand All @@ -232,8 +208,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x f
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
Expand All @@ -256,8 +230,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -270,8 +242,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x f
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
Expand All @@ -294,8 +264,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -308,8 +276,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x f
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
Expand All @@ -332,8 +298,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -346,8 +310,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x f
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
Expand All @@ -370,8 +332,6 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
Expand All @@ -384,8 +344,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
Expand All @@ -408,8 +366,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
Expand Down
36 changes: 0 additions & 36 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
Expand All @@ -20,8 +18,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
Expand All @@ -34,8 +30,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
Expand All @@ -50,8 +44,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
Expand All @@ -64,8 +56,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
Expand All @@ -78,8 +68,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
Expand All @@ -94,8 +82,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
Expand All @@ -108,8 +94,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0)
Expand All @@ -122,8 +106,6 @@ define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp
; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1)
Expand All @@ -141,8 +123,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
Expand All @@ -155,8 +135,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
Expand All @@ -169,8 +147,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1)
Expand All @@ -185,8 +161,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
Expand All @@ -199,8 +173,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
Expand All @@ -213,8 +185,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
Expand All @@ -229,8 +199,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
Expand All @@ -243,8 +211,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32>
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
Expand All @@ -257,8 +223,6 @@ define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B,
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
Expand Down
Loading