22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
Expand All @@ -29,6 +30,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
Expand All @@ -43,6 +45,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
Expand All @@ -55,6 +58,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
Expand All @@ -69,6 +73,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -81,6 +86,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -95,6 +101,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -108,6 +115,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -120,6 +128,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -132,6 +141,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -144,6 +154,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -156,6 +167,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -168,6 +180,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -180,6 +193,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -194,6 +208,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -206,6 +221,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -218,6 +234,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -230,6 +247,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -242,6 +260,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -254,6 +273,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -266,6 +286,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -278,6 +299,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W64: ; %bb.0: ; %bb
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,7 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, s2
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, s2
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
Expand Down Expand Up @@ -582,6 +583,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, 0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, 0, vcc_lo
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
bb:
%umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/mad_64_32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,7 @@ define amdgpu_kernel void @mad_i64_i32_uniform(i64 addrspace(1)* %out, i32 %arg0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: v_mov_b32_e32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%ext0 = zext i32 %arg0 to i64
%ext1 = zext i32 %arg1 to i64
Expand Down
120 changes: 120 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_0:
Expand All @@ -139,6 +140,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down Expand Up @@ -277,6 +279,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_1:
Expand All @@ -292,6 +295,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down Expand Up @@ -418,6 +422,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_0:
Expand All @@ -431,6 +436,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down Expand Up @@ -569,6 +575,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_1:
Expand All @@ -584,6 +591,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down
184 changes: 184 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll

Large diffs are not rendered by default.

120 changes: 120 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_0:
Expand All @@ -89,6 +90,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down Expand Up @@ -174,6 +176,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_load_1:
Expand All @@ -190,6 +193,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down Expand Up @@ -269,6 +273,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_0:
Expand All @@ -283,6 +288,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down Expand Up @@ -368,6 +374,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_nontemporal_store_1:
Expand All @@ -384,6 +391,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down Expand Up @@ -463,6 +471,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_volatile_workgroup_acquire_load:
Expand All @@ -477,6 +486,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
Expand Down Expand Up @@ -549,6 +559,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_volatile_workgroup_release_store:
Expand All @@ -562,6 +573,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
Expand Down
182 changes: 182 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll

Large diffs are not rendered by default.

148 changes: 148 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll

Large diffs are not rendered by default.

118 changes: 118 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_load_0:
Expand All @@ -138,6 +139,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -271,6 +273,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_load_1:
Expand All @@ -282,6 +285,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -408,6 +412,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_0:
Expand All @@ -419,6 +424,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -547,6 +553,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_nontemporal_store_1:
Expand All @@ -558,6 +565,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down
184 changes: 184 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll

Large diffs are not rendered by default.

118 changes: 118 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_load_0:
Expand All @@ -89,6 +90,7 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -178,6 +180,7 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_load_1:
Expand All @@ -189,6 +192,7 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -277,6 +281,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_store_0:
Expand All @@ -289,6 +294,7 @@ define amdgpu_kernel void @global_volatile_store_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -379,6 +385,7 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_store_1:
Expand All @@ -391,6 +398,7 @@ define amdgpu_kernel void @global_volatile_store_1(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -475,6 +483,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_workgroup_acquire_load:
Expand All @@ -485,6 +494,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -566,6 +576,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_volatile_workgroup_release_store:
Expand All @@ -578,6 +589,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
Expand Down
184 changes: 184 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll

Large diffs are not rendered by default.

152 changes: 152 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_load_0:
Expand All @@ -152,6 +153,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -291,6 +293,7 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_load_1:
Expand All @@ -304,6 +307,7 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_load_0:
Expand All @@ -100,6 +101,7 @@ define amdgpu_kernel void @local_volatile_load_0(
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -191,6 +193,7 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX11-WGP-NEXT: ds_load_b32 v0, v0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_volatile_load_1:
Expand All @@ -204,6 +207,7 @@ define amdgpu_kernel void @local_volatile_load_1(
; GFX11-CU-NEXT: ds_load_b32 v0, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_0:
Expand All @@ -174,6 +175,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -338,6 +340,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_1:
Expand All @@ -350,6 +353,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -509,6 +513,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_0:
Expand All @@ -521,6 +526,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
Expand Down Expand Up @@ -684,6 +690,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_1:
Expand All @@ -697,6 +704,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
Expand Down
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_load_0:
Expand All @@ -118,6 +119,7 @@ define amdgpu_kernel void @private_volatile_load_0(
; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -228,6 +230,7 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_load_1:
Expand All @@ -240,6 +243,7 @@ define amdgpu_kernel void @private_volatile_load_1(
; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
Expand Down Expand Up @@ -353,6 +357,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_store_0:
Expand All @@ -366,6 +371,7 @@ define amdgpu_kernel void @private_volatile_store_0(
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
Expand Down Expand Up @@ -481,6 +487,7 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: private_volatile_store_1:
Expand All @@ -495,6 +502,7 @@ define amdgpu_kernel void @private_volatile_store_1(
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 dlc
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
Expand Down
411 changes: 411 additions & 0 deletions llvm/test/CodeGen/AMDGPU/release-vgprs.mir

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
Expand All @@ -53,6 +54,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
Expand All @@ -75,6 +77,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
Expand All @@ -95,6 +98,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
Expand All @@ -117,6 +121,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -137,6 +142,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16
; W32-NEXT: global_store_b128 v[26:27], v[16:19], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -159,6 +165,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -179,6 +186,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -199,6 +207,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -219,6 +228,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -239,6 +249,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -259,6 +270,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -279,6 +291,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -299,6 +312,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16
; W32-NEXT: global_store_b128 v[18:19], v[8:11], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -321,6 +335,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -341,6 +356,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -361,6 +377,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -381,6 +398,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
Expand All @@ -402,6 +420,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -422,6 +441,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -442,6 +462,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
Expand All @@ -462,6 +483,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16
; W32-NEXT: global_store_b128 v[14:15], v[4:7], off
; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W32-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 1)
Expand Down
22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
Expand All @@ -45,6 +46,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
Expand All @@ -63,6 +65,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
Expand All @@ -79,6 +82,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
Expand All @@ -97,6 +101,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -113,6 +118,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -131,6 +137,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -148,6 +155,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed(<4 x i32> %A,
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -164,6 +172,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned(<4 x i32> %A,
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -180,6 +189,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed(<4 x i32> %A, <4
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -196,6 +206,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned_clamp(<4 x i
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -212,6 +223,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_signed_clamp(<4 x i32
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 0, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -228,6 +240,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_unsigned_clamp(<4 x i32
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 0, <4 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -244,6 +257,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_signed_signed_clamp(<4 x i32>
; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[12:13], v[16:19], off
; W64-NEXT: global_store_b128 v[14:15], v[8:11], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 1, <4 x i32> %A, i1 1, <4 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -262,6 +276,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned(<2 x i32> %A
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -278,6 +293,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed(<2 x i32> %A,
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -294,6 +310,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned(<2 x i32> %A,
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -310,6 +327,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed(<2 x i32> %A, <2
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0]
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 0)
Expand All @@ -326,6 +344,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_unsigned_clamp(<2 x i
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -342,6 +361,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_unsigned_signed_clamp(<2 x i32
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -358,6 +378,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_unsigned_clamp(<2 x i32
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <4 x i32> %C, i1 1)
Expand All @@ -374,6 +395,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x16_ui4_signed_signed_clamp(<2 x i32>
; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp
; W64-NEXT: global_store_b128 v[8:9], v[12:15], off
; W64-NEXT: global_store_b128 v[10:11], v[4:7], off
; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; W64-NEXT: s_endpgm
bb:
%res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 1, <2 x i32> %A, i1 1, <2 x i32> %B, <4 x i32> %C, i1 1)
Expand Down