Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc,
; GFX12-NEXT: s_mov_b32 s6, s8
; GFX12-NEXT: s_mov_b32 s7, s9
; GFX12-NEXT: image_load v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
Expand Down Expand Up @@ -155,7 +155,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12
; GFX12-NEXT: v_mov_b32_e32 v4, v13
; GFX12-NEXT: image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v9, v4, s[10:11]
; GFX12-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
Expand Down Expand Up @@ -270,7 +270,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12
; GFX12-NEXT: v_mov_b32_e32 v4, v13
; GFX12-NEXT: image_load v[0:4], [v5, v6, v7, v8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY tfe
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v9, v4, s[10:11]
; GFX12-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
; GFX12-NEXT: s_mov_b32 s6, s8
; GFX12-NEXT: s_mov_b32 s7, s9
; GFX12-NEXT: image_load v[0:3], [v0, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
Expand Down Expand Up @@ -158,7 +158,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
; GFX12-NEXT: v_mov_b32_e32 v4, v10
; GFX12-NEXT: image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v6, v4, s[10:11]
; GFX12-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0)
Expand Down Expand Up @@ -270,7 +270,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
; GFX12-NEXT: v_mov_b32_e32 v4, v10
; GFX12-NEXT: image_load v[0:4], [v11, v5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D a16 tfe
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v6, v4, s[10:11]
; GFX12-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s,
; GFX12-NEXT: s_mov_b32 s6, s8
; GFX12-NEXT: s_mov_b32 s7, s9
; GFX12-NEXT: image_load v[0:3], [v0, v1, v2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
Expand Down Expand Up @@ -151,7 +151,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
; GFX12-NEXT: v_mov_b32_e32 v4, v12
; GFX12-NEXT: image_load v[0:4], [v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D tfe
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v8, v4, s[10:11]
; GFX12-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0)
Expand Down Expand Up @@ -262,7 +262,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
; GFX12-NEXT: v_mov_b32_e32 v4, v12
; GFX12-NEXT: image_load v[0:4], [v5, v6, v7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D tfe
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v8, v4, s[10:11]
; GFX12-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-LABEL: sample_d_1d:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down Expand Up @@ -54,7 +54,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX12-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down Expand Up @@ -91,7 +91,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX12-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; GFX12-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v1, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand All @@ -114,7 +114,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-LABEL: sample_c_d_1d:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down Expand Up @@ -149,7 +149,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX12-NEXT: v_lshl_or_b32 v2, v4, 16, v3
; GFX12-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand All @@ -172,7 +172,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-LABEL: sample_d_cl_1d:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down Expand Up @@ -207,7 +207,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX12-NEXT: v_lshl_or_b32 v1, v3, 16, v2
; GFX12-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand All @@ -230,7 +230,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-LABEL: sample_c_d_cl_1d:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[3:4]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down Expand Up @@ -267,7 +267,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX12-NEXT: v_lshl_or_b32 v2, v4, 16, v3
; GFX12-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down Expand Up @@ -306,7 +306,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX12-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX12-NEXT: v_lshl_or_b32 v5, v5, 16, v4
; GFX12-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down Expand Up @@ -345,7 +345,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX12-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX12-NEXT: v_lshl_or_b32 v5, v5, 16, v4
; GFX12-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down
48 changes: 40 additions & 8 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@ define float @v_rsq_clamp_f32(float %src) #0 {
;
; GFX12-LABEL: v_rsq_clamp_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f32_e32 v0, v0
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
Expand All @@ -47,7 +51,11 @@ define float @v_rsq_clamp_fabs_f32(float %src) #0 {
;
; GFX12-LABEL: v_rsq_clamp_fabs_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f32_e64 v0, |v0|
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
Expand Down Expand Up @@ -78,7 +86,11 @@ define double @v_rsq_clamp_f64(double %src) #0 {
;
; GFX12-LABEL: v_rsq_clamp_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s0, -1
; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
Expand Down Expand Up @@ -112,7 +124,11 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
;
; GFX12-LABEL: v_rsq_clamp_fabs_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]|
; GFX12-NEXT: s_mov_b32 s0, -1
; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
Expand Down Expand Up @@ -144,7 +160,11 @@ define float @v_rsq_clamp_undef_f32() #0 {
;
; GFX12-LABEL: v_rsq_clamp_undef_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_s_rsq_f32 s0, s0
; GFX12-NEXT: v_mov_b32_e32 v0, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
Expand Down Expand Up @@ -174,7 +194,11 @@ define double @v_rsq_clamp_undef_f64() #0 {
;
; GFX12-LABEL: v_rsq_clamp_undef_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], s[0:1]
; GFX12-NEXT: s_mov_b32 s0, -1
; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
Expand Down Expand Up @@ -205,7 +229,11 @@ define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
;
; GFX12-LABEL: v_rsq_clamp_f32_non_ieee:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f32_e32 v0, v0
; GFX12-NEXT: v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
Expand Down Expand Up @@ -235,7 +263,11 @@ define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
;
; GFX12-LABEL: v_rsq_clamp_f64_non_ieee:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; GFX12-NEXT: s_mov_b32 s0, -1
; GFX12-NEXT: s_mov_b32 s1, 0x7fefffff
Expand Down
120 changes: 80 additions & 40 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,22 @@
define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
; GFX12-UNALIGNED: ; %bb.0:
; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
; GFX12-NOUNALIGNED: ; %bb.0:
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_clause 0xb
; GFX12-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off
; GFX12-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1
Expand All @@ -32,23 +40,23 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9
; GFX12-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11
; GFX12-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x9
; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8
; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v5
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x5
; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v9
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1
; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
; GFX12-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1
; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4
Expand Down Expand Up @@ -217,26 +225,34 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
; GFX12-UNALIGNED: ; %bb.0:
; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
; GFX12-NOUNALIGNED: ; %bb.0:
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_clause 0x5
; GFX12-NOUNALIGNED-NEXT: global_load_u16 v2, v[0:1], off
; GFX12-NOUNALIGNED-NEXT: global_load_u16 v3, v[0:1], off offset:2
; GFX12-NOUNALIGNED-NEXT: global_load_u16 v4, v[0:1], off offset:4
; GFX12-NOUNALIGNED-NEXT: global_load_u16 v5, v[0:1], off offset:6
; GFX12-NOUNALIGNED-NEXT: global_load_u16 v6, v[0:1], off offset:8
; GFX12-NOUNALIGNED-NEXT: global_load_u16 v7, v[0:1], off offset:10
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4
; GFX12-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
Expand Down Expand Up @@ -332,9 +348,13 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
; GFX12-LABEL: v_load_constant_v3i32_align4:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align4:
Expand Down Expand Up @@ -374,9 +394,13 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
; GFX12-LABEL: v_load_constant_i96_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_i96_align8:
Expand Down Expand Up @@ -416,9 +440,13 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
; GFX12-LABEL: v_load_constant_v3i32_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align8:
Expand Down Expand Up @@ -458,9 +486,13 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
; GFX12-LABEL: v_load_constant_v6i16_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v6i16_align8:
Expand Down Expand Up @@ -509,9 +541,13 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
; GFX12-LABEL: v_load_constant_v12i8_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v13, 8, v0
; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 24, v0
Expand Down Expand Up @@ -598,9 +634,13 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
; GFX12-LABEL: v_load_constant_v3i32_align16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align16:
Expand Down Expand Up @@ -638,7 +678,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
; GFX12-UNALIGNED: ; %bb.0:
; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
Expand All @@ -659,7 +699,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
; GFX12-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4
; GFX12-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa
; GFX12-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8
; GFX12-NOUNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8
; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24
; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16
Expand Down Expand Up @@ -852,7 +892,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
; GFX12-UNALIGNED: ; %bb.0:
; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
; GFX12-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX12-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
Expand All @@ -867,7 +907,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
; GFX12-NOUNALIGNED-NEXT: s_load_u16 s5, s[0:1], 0x0
; GFX12-NOUNALIGNED-NEXT: s_load_u16 s6, s[0:1], 0x4
; GFX12-NOUNALIGNED-NEXT: s_load_u16 s7, s[0:1], 0x8
; GFX12-NOUNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 16
; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 16
; GFX12-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16
Expand Down Expand Up @@ -978,7 +1018,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
; GFX12-LABEL: s_load_constant_v3i32_align4:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_load_constant_v3i32_align4:
Expand Down Expand Up @@ -1015,7 +1055,7 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_i96_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_load_constant_i96_align8:
Expand Down Expand Up @@ -1052,7 +1092,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
; GFX12-LABEL: s_load_constant_v3i32_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_load_constant_v3i32_align8:
Expand Down Expand Up @@ -1089,7 +1129,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
; GFX12-LABEL: s_load_constant_v6i16_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_load_constant_v6i16_align8:
Expand Down Expand Up @@ -1127,7 +1167,7 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
; GFX12-LABEL: s_load_constant_v12i8_align8:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s13, s0, 8
; GFX12-NEXT: s_lshr_b32 s12, s0, 16
; GFX12-NEXT: s_lshr_b32 s3, s0, 24
Expand Down Expand Up @@ -1204,7 +1244,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg
; GFX12-LABEL: s_load_constant_v3i32_align16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
; GCN-LABEL: s_load_constant_v3i32_align16:
Expand Down
84 changes: 42 additions & 42 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
Original file line number Diff line number Diff line change
Expand Up @@ -671,8 +671,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr(ptr addrspace(1) inreg %ptr) {
; GFX12-LABEL: mubuf_load_sgpr_ptr:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%val = load volatile float, ptr addrspace(1) %ptr
ret float %val
Expand Down Expand Up @@ -704,8 +704,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095(ptr addrspace(1) inreg %p
; GFX12-LABEL: mubuf_load_sgpr_ptr_offset4095:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:16380 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:16380 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4095
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -747,8 +747,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967296(ptr addrspace(1) in
; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967296
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -790,8 +790,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4294967297(ptr addrspace(1) in
; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967297
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -824,8 +824,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4096(ptr addrspace(1) inreg %p
; GFX12-LABEL: mubuf_load_sgpr_ptr_offset4096:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:16384 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4096
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -855,8 +855,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4095(ptr addrspace(1) %ptr) {
;
; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4095:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4095
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -893,8 +893,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967296(ptr addrspace(1) %p
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967296
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -931,8 +931,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4294967297(ptr addrspace(1) %p
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4294967297
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -962,8 +962,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_offset4096(ptr addrspace(1) %ptr) {
;
; GFX12-LABEL: mubuf_load_vgpr_ptr_offset4096:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16384 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 4096
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -1007,8 +1007,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_sgpr_offset(ptr addrspace(1) inreg %
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_co_u32 s0, s2, s0
; GFX12-NEXT: s_add_co_ci_u32 s1, s3, s1
; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -1045,8 +1045,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset(ptr addrspace(1) %ptr, i
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -1083,8 +1083,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(ptr addrspace(
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %soffset
%gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 256
Expand Down Expand Up @@ -1122,8 +1122,8 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(ptr addrspace(
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:1024 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 256
%gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %soffset
Expand Down Expand Up @@ -1165,8 +1165,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset(ptr addrspace(1) inreg %
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset
%val = load volatile float, ptr addrspace(1) %gep
Expand Down Expand Up @@ -1209,8 +1209,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_vgpr_offset_offset4095(ptr addrspace
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i32 %voffset
%gep1 = getelementptr float, ptr addrspace(1) %gep0, i64 4095
Expand Down Expand Up @@ -1253,8 +1253,8 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(ptr addrspace
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:16380 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 4095
%gep1 = getelementptr float, ptr addrspace(1) %gep0, i32 %voffset
Expand Down Expand Up @@ -1294,7 +1294,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
; GFX12: ; %bb.0:
; GFX12-NEXT: v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
Expand Down Expand Up @@ -1345,7 +1345,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, 2
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
Expand Down Expand Up @@ -1386,7 +1386,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 2
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
Expand Down Expand Up @@ -1434,7 +1434,7 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 2
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
Expand Down Expand Up @@ -1486,7 +1486,7 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
Expand Down Expand Up @@ -1530,7 +1530,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
; GFX12-NEXT: v_mov_b32_e32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
Expand Down Expand Up @@ -1583,7 +1583,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX12-NEXT: v_mov_b32_e32 v2, v0
; GFX12-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
Expand Down Expand Up @@ -1625,7 +1625,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr,
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v4, v2
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095
Expand Down Expand Up @@ -1673,7 +1673,7 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v6, vcc_lo
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296
Expand Down Expand Up @@ -1726,7 +1726,7 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: ; return to shader part epilog
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %voffset
Expand Down
66 changes: 51 additions & 15 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
;
; GFX12-LABEL: v_mul_i16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
Expand Down Expand Up @@ -152,7 +156,11 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
;
; GFX12-LABEL: v_mul_i16_zeroext:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
Expand Down Expand Up @@ -237,7 +245,11 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
;
; GFX12-LABEL: v_mul_i16_signext:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
Expand Down Expand Up @@ -280,7 +292,11 @@ define i32 @v_mul_i32(i32 %num, i32 %den) {
;
; GFX12-LABEL: v_mul_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i32 %num, %den
Expand Down Expand Up @@ -326,7 +342,11 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
;
; GFX12-LABEL: v_mul_v2i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -480,7 +500,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
;
; GFX12-LABEL: v_mul_i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2
; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2
Expand Down Expand Up @@ -653,7 +677,11 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
;
; GFX12-LABEL: v_mul_i96:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
; GFX12-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
Expand Down Expand Up @@ -989,7 +1017,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
;
; GFX12-LABEL: v_mul_i128:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
; GFX12-NEXT: v_mov_b32_e32 v10, v2
; GFX12-NEXT: v_mul_lo_u32 v3, v3, v4
Expand Down Expand Up @@ -2352,7 +2384,11 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
;
; GFX12-LABEL: v_mul_i256:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9
; GFX12-NEXT: v_mul_lo_u32 v7, v7, v8
Expand Down Expand Up @@ -2496,7 +2532,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX12-LABEL: s_mul_u64_zext_with_vregs:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b32 v2, v[2:3], off
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_nop 0
Expand Down Expand Up @@ -2591,10 +2627,10 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
Expand Down Expand Up @@ -2673,7 +2709,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX12-LABEL: s_mul_u64_sext_with_vregs:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b32 v2, v[2:3], off
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_nop 0
Expand Down Expand Up @@ -2783,9 +2819,9 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_ashr_i32 s3, s2, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
Expand Down
42 changes: 21 additions & 21 deletions llvm/test/CodeGen/AMDGPU/add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX12-LABEL: s_add_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s2, s3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
Expand Down Expand Up @@ -170,9 +170,9 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX12-LABEL: s_add_v2i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s4, s6
; GFX12-NEXT: s_add_co_i32 s3, s5, s7
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
Expand Down Expand Up @@ -287,9 +287,9 @@ define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX12-LABEL: s_add_v4i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s3, s3, s7
; GFX12-NEXT: s_add_co_i32 s2, s2, s6
; GFX12-NEXT: s_add_co_i32 s0, s0, s4
Expand Down Expand Up @@ -454,7 +454,7 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s7, s15
; GFX12-NEXT: s_add_co_i32 s3, s6, s14
; GFX12-NEXT: s_add_co_i32 s6, s11, s19
Expand Down Expand Up @@ -728,7 +728,7 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
; GFX12-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s2, s7, s39
; GFX12-NEXT: s_add_co_i32 s3, s6, s38
; GFX12-NEXT: s_add_co_i32 s6, s11, s43
Expand Down Expand Up @@ -859,11 +859,11 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX12-NEXT: s_nop 0
Expand Down Expand Up @@ -956,9 +956,9 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] th:TH_LOAD_RT_NT
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_nop 0
Expand Down Expand Up @@ -1051,7 +1051,7 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
Expand Down Expand Up @@ -1157,9 +1157,9 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
Expand Down Expand Up @@ -1308,7 +1308,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-LABEL: add64_in_branch:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT: s_cbranch_scc0 .LBB9_4
; GFX12-NEXT: ; %bb.1: ; %else
Expand All @@ -1318,7 +1318,7 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: .LBB9_2: ; %if
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-NEXT: .LBB9_3: ; %endif
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
Expand Down
92 changes: 46 additions & 46 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -234,17 +234,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -265,17 +265,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W32-NEXT: s_mul_i32 s3, s3, 5
; GFX12W32-NEXT: v_mov_b32_e32 v1, s3
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -512,17 +512,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
Expand All @@ -544,17 +544,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_mul_i32 s4, s2, s4
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
Expand Down Expand Up @@ -857,17 +857,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v0, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -900,16 +900,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; %bb.3:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12W32-NEXT: v_mov_b32_e32 v0, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -1230,18 +1230,18 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_load_b32 s5, s[0:1], 0x44
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v0, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mov_b32_e32 v2, s5
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB3_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -1276,17 +1276,17 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: s_load_b32 s8, s[0:1], 0x44
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12W32-NEXT: v_mov_b32_e32 v0, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mov_b32_e32 v2, s8
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB3_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -1368,10 +1368,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v1, 1
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -1608,18 +1608,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -1640,18 +1640,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W32-NEXT: s_mul_i32 s3, s3, 5
; GFX12W32-NEXT: v_mov_b32_e32 v1, s3
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -1892,17 +1892,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
Expand All @@ -1925,17 +1925,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_mul_i32 s4, s2, s4
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
Expand Down Expand Up @@ -2240,17 +2240,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v0, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB7_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -2283,17 +2283,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; %bb.3:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12W32-NEXT: v_mov_b32_e32 v0, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -2375,10 +2375,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v1, 1
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
128 changes: 64 additions & 64 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Large diffs are not rendered by default.

80 changes: 40 additions & 40 deletions llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -233,17 +233,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -264,17 +264,17 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W32-NEXT: s_mul_i32 s3, s3, 5
; GFX12W32-NEXT: v_mov_b32_e32 v1, s3
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -511,17 +511,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
Expand All @@ -543,17 +543,17 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_mul_i32 s4, s2, s4
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
Expand Down Expand Up @@ -856,17 +856,17 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v0, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -899,16 +899,16 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; %bb.3:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12W32-NEXT: v_mov_b32_e32 v0, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -990,10 +990,10 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v1, 1
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -1230,18 +1230,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W64-NEXT: s_mul_i32 s4, s4, 5
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB4_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -1262,18 +1262,18 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12W32-NEXT: s_mul_i32 s3, s3, 5
; GFX12W32-NEXT: v_mov_b32_e32 v1, s3
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB4_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -1514,17 +1514,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: ; %bb.1:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_mul_i32 s4, s6, s4
; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W64-NEXT: v_mov_b32_e32 v1, s4
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
Expand All @@ -1547,17 +1547,17 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: ; %bb.1:
; GFX12W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W32-NEXT: s_bcnt1_i32_b32 s4, s4
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_mul_i32 s4, s2, s4
; GFX12W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12W32-NEXT: v_mov_b32_e32 v1, s4
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
Expand Down Expand Up @@ -1862,17 +1862,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: ; %bb.3:
; GFX12W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34
; GFX12W64-NEXT: v_mov_b32_e32 v0, s4
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: s_wait_loadcnt 0x0
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -1905,17 +1905,17 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; %bb.3:
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12W32-NEXT: v_mov_b32_e32 v0, s2
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down Expand Up @@ -1997,10 +1997,10 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v1, 1
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_waitcnt vmcnt(0)
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Loading