@@ -19,6 +19,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
; GFX9-NEXT: s_mov_b32 s7, s9
; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8
; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_3d_v4f32_xyzw:
@@ -38,6 +39,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
%v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
@@ -64,6 +66,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
; GFX9-NEXT: v_mov_b32_e32 v6, s11
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[5:6], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
@@ -87,6 +90,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[5:6], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue { <4 x float>, i32 } %v, 0
@@ -116,6 +120,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX9-NEXT: v_mov_b32_e32 v6, s11
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[5:6], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
@@ -139,6 +144,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[5:6], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0)
%v.vec = extractvalue { <4 x float>, i32 } %v, 0
@@ -14,6 +14,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s,
; GFX6-NEXT: s_mov_b32 s6, s8
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_3d_v4f32_xyzw:
@@ -28,6 +29,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i32 %s,
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
%v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %v
@@ -51,6 +53,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
; GFX6-NEXT: s_mov_b32 s11, 0xf000
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
@@ -69,6 +72,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[5:6], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue { <4 x float>, i32 } %v, 0
@@ -95,6 +99,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX6-NEXT: s_mov_b32 s11, 0xf000
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
@@ -113,6 +118,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[5:6], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
%v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
%v.vec = extractvalue { <4 x float>, i32 } %v, 0
@@ -10,6 +10,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -26,6 +27,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -45,6 +47,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: v_and_or_b32 v2, v3, v11, v4
; GFX10-NEXT: v_and_or_b32 v3, v5, v11, s12
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -60,6 +63,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -76,6 +80,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -91,6 +96,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -107,6 +113,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -122,6 +129,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -138,6 +146,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -153,6 +162,7 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12
; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -169,6 +179,7 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -184,6 +195,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -200,6 +212,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -215,6 +228,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -231,6 +245,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -246,6 +261,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -262,6 +278,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -278,6 +295,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11
; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -294,6 +312,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
@@ -18,6 +18,7 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: s_mov_b32 s10, s12
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_l_1d:
@@ -36,6 +37,7 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -61,6 +63,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_l_2d:
@@ -81,6 +84,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half -0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -106,6 +110,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s12
; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_l_1d:
@@ -126,6 +131,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half -2.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -151,6 +157,7 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_l_2d:
@@ -171,6 +178,7 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -196,6 +204,7 @@ define amdgpu_ps <4 x float> @sample_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s12
; GFX9-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_l_o_1d:
@@ -216,6 +225,7 @@ define amdgpu_ps <4 x float> @sample_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f16(i32 15, i32 %offset, half %s, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -241,6 +251,7 @@ define amdgpu_ps <4 x float> @sample_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
; GFX9-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_l_o_2d:
@@ -261,6 +272,7 @@ define amdgpu_ps <4 x float> @sample_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f16(i32 15, i32 %offset, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -286,6 +298,7 @@ define amdgpu_ps <4 x float> @sample_c_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v2, v2, v3, s12
; GFX9-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_l_o_1d:
@@ -306,6 +319,7 @@ define amdgpu_ps <4 x float> @sample_c_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f16(i32 15, i32 %offset, float %zcompare, half %s, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -331,6 +345,7 @@ define amdgpu_ps <4 x float> @sample_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3
; GFX9-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: sample_c_l_o_2d:
@@ -351,6 +366,7 @@ define amdgpu_ps <4 x float> @sample_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f16(i32 15, i32 %offset, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -376,6 +392,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_l_2d:
@@ -396,6 +413,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 15, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -421,6 +439,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_l_2d:
@@ -441,6 +460,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -466,6 +486,7 @@ define amdgpu_ps <4 x float> @gather4_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
; GFX9-NEXT: image_gather4_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_l_o_2d:
@@ -486,6 +507,7 @@ define amdgpu_ps <4 x float> @gather4_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_gather4_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f16(i32 15, i32 %offset, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -511,6 +533,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: s_mov_b32 s11, s13
; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3
; GFX9-NEXT: image_gather4_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_l_o_2d:
@@ -531,6 +554,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: s_mov_b32 s11, s13
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: image_gather4_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f16(i32 15, i32 %offset, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -17,6 +17,7 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -39,6 +40,7 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float -0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -61,6 +63,7 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float -2.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -83,6 +86,7 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_sample_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -105,6 +109,7 @@ define amdgpu_ps <4 x float> @sample_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -127,6 +132,7 @@ define amdgpu_ps <4 x float> @sample_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_sample_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -149,6 +155,7 @@ define amdgpu_ps <4 x float> @sample_c_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> i
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -171,6 +178,7 @@ define amdgpu_ps <4 x float> @sample_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_sample_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -193,6 +201,7 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 15, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -215,6 +224,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -237,6 +247,7 @@ define amdgpu_ps <4 x float> @gather4_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_gather4_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
@@ -259,6 +270,7 @@ define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GCN-NEXT: s_mov_b32 s10, s12
; GCN-NEXT: s_mov_b32 s11, s13
; GCN-NEXT: image_gather4_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
main_body:
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -12,6 +12,7 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -74,6 +75,7 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
; GFX7-UNALIGNED-NEXT: s_mov_b32 s7, 0xf000
; GFX7-UNALIGNED-NEXT: s_mov_b64 s[4:5], 0
; GFX7-UNALIGNED-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
@@ -148,6 +150,7 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
@@ -184,6 +187,7 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
; GFX7-UNALIGNED-NEXT: s_mov_b32 s7, 0xf000
; GFX7-UNALIGNED-NEXT: s_mov_b64 s[4:5], 0
; GFX7-UNALIGNED-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
@@ -227,6 +231,7 @@ define <3 x i32> @v_load_constant_v3i32_align4(<3 x i32> addrspace(4)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_constant_v3i32_align4:
@@ -236,6 +241,7 @@ define <3 x i32> @v_load_constant_v3i32_align4(<3 x i32> addrspace(4)* %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 4
ret <3 x i32> %load
@@ -246,6 +252,7 @@ define i96 @v_load_constant_i96_align8(i96 addrspace(4)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_constant_i96_align8:
@@ -255,6 +262,7 @@ define i96 @v_load_constant_i96_align8(i96 addrspace(4)* %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load i96, i96 addrspace(4)* %ptr, align 8
ret i96 %load
@@ -265,6 +273,7 @@ define <3 x i32> @v_load_constant_v3i32_align8(<3 x i32> addrspace(4)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_constant_v3i32_align8:
@@ -274,6 +283,7 @@ define <3 x i32> @v_load_constant_v3i32_align8(<3 x i32> addrspace(4)* %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 8
ret <3 x i32> %load
@@ -284,6 +294,7 @@ define <6 x i16> @v_load_constant_v6i16_align8(<6 x i16> addrspace(4)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_constant_v6i16_align8:
@@ -357,6 +368,7 @@ define <3 x i32> @v_load_constant_v3i32_align16(<3 x i32> addrspace(4)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_load_constant_v3i32_align16:
@@ -366,6 +378,7 @@ define <3 x i32> @v_load_constant_v3i32_align16(<3 x i32> addrspace(4)* %ptr) {
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 16
ret <3 x i32> %load
@@ -678,6 +691,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)*
; GFX9-NEXT: s_mov_b32 s3, s1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_load_constant_v3i32_align4:
@@ -686,6 +700,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(<3 x i32> addrspace(4)*
; GFX7-NEXT: s_mov_b32 s3, s1
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 4
ret <3 x i32> %load
@@ -698,6 +713,7 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(i96 addrspace(4)* inreg %ptr) {
; GFX9-NEXT: s_mov_b32 s3, s1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_load_constant_i96_align8:
@@ -706,6 +722,7 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(i96 addrspace(4)* inreg %ptr) {
; GFX7-NEXT: s_mov_b32 s3, s1
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%load = load i96, i96 addrspace(4)* %ptr, align 8
ret i96 %load
@@ -718,6 +735,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(<3 x i32> addrspace(4)*
; GFX9-NEXT: s_mov_b32 s3, s1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_load_constant_v3i32_align8:
@@ -726,6 +744,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(<3 x i32> addrspace(4)*
; GFX7-NEXT: s_mov_b32 s3, s1
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 8
ret <3 x i32> %load
@@ -738,6 +757,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(<6 x i16> addrspace(4)*
; GFX9-NEXT: s_mov_b32 s3, s1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_load_constant_v6i16_align8:
@@ -746,6 +766,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(<6 x i16> addrspace(4)*
; GFX7-NEXT: s_mov_b32 s3, s1
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%load = load <6 x i16>, <6 x i16> addrspace(4)* %ptr, align 8
%cast = bitcast <6 x i16> %load to <3 x i32>
@@ -796,6 +817,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(<3 x i32> addrspace(4)
; GCN-LABEL: s_load_constant_v3i32_align16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 16
ret <3 x i32> %load
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -10,6 +10,7 @@ define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b128 v[0:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b128 v[0:3], v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
ret <4 x i32> %load
@@ -245,6 +247,7 @@ define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align4:
@@ -254,6 +257,7 @@ define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX7-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
ret <4 x i32> %load
@@ -264,6 +268,7 @@ define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b128 v[0:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align8:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
ret <4 x i32> %load
@@ -281,6 +287,7 @@ define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b128 v[0:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b128 v[0:3], v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
ret <4 x i32> %load
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -10,6 +10,7 @@ define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b96 v[0:2], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b96 v[0:2], v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr
ret <3 x i32> %load
@@ -203,6 +205,7 @@ define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align4:
@@ -212,6 +215,7 @@ define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
ret <3 x i32> %load
@@ -224,6 +228,7 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: ds_read_b64 v[0:1], v0
; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align8:
@@ -233,6 +238,7 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b64 v[0:1], v0
; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8
ret <3 x i32> %load
@@ -243,6 +249,7 @@ define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b96 v[0:2], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_read_b96 v[0:2], v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
ret <3 x i32> %load
6 changes: 6 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -11,6 +11,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b128 v[0:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
@@ -98,6 +99,7 @@ define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_read_b96 v[0:2], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align1:
@@ -169,6 +171,7 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_write_b128 v0, v[1:4]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: store_lds_v4i32_align1:
@@ -203,6 +206,7 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
; GFX7-NEXT: ds_write_b8 v0, v1 offset:13
; GFX7-NEXT: ds_write_b8 v0, v2 offset:14
; GFX7-NEXT: ds_write_b8 v0, v3 offset:15
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
ret void
@@ -213,6 +217,7 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_write_b96 v0, v[1:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: store_lds_v3i32_align1:
@@ -240,6 +245,7 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
; GFX7-NEXT: ds_write_b8 v0, v1 offset:9
; GFX7-NEXT: ds_write_b8 v0, v2 offset:10
; GFX7-NEXT: ds_write_b8 v0, v4 offset:11
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
ret void
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -203,6 +203,7 @@ define void @localize_internal_globals(i1 %cond) {
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: BB2_4: ; %bb2
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
entry:
br i1 %cond, label %bb0, label %bb1
42 changes: 42 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll


2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -199,6 +199,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]

entry:
@@ -262,6 +263,7 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_sub_u32 s32, s32, 0x2000
; GCN-NEXT: s_mov_b32 s33, s8
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%cond = icmp eq i32 %arg.cond, 0
15 changes: 15 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
@@ -44,6 +44,7 @@ define i64 @zextload_global_i8_to_i64(i8 addrspace(1)* %ptr) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: zextload_global_i8_to_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: zextload_global_i8_to_i64:
@@ -61,6 +63,7 @@ define i64 @zextload_global_i8_to_i64(i8 addrspace(1)* %ptr) {
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%load = load i8, i8 addrspace(1)* %ptr
%ext = zext i8 %load to i64
@@ -73,6 +76,7 @@ define i64 @zextload_global_i16_to_i64(i16 addrspace(1)* %ptr) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: zextload_global_i16_to_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: zextload_global_i16_to_i64:
@@ -90,6 +95,7 @@ define i64 @zextload_global_i16_to_i64(i16 addrspace(1)* %ptr) {
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%load = load i16, i16 addrspace(1)* %ptr
%ext = zext i16 %load to i64
@@ -102,6 +108,7 @@ define i64 @zextload_global_i32_to_i64(i32 addrspace(1)* %ptr) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: zextload_global_i32_to_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: zextload_global_i32_to_i64:
@@ -119,6 +127,7 @@ define i64 @zextload_global_i32_to_i64(i32 addrspace(1)* %ptr) {
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr
%ext = zext i32 %load to i64
@@ -132,6 +141,7 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) {
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: zextload_global_i32_to_i96:
@@ -140,6 +150,7 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) {
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: zextload_global_i32_to_i96:
@@ -151,6 +162,7 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) {
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr
%ext = zext i32 %load to i96
@@ -165,6 +177,7 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: zextload_global_i32_to_i128:
@@ -174,6 +187,7 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: zextload_global_i32_to_i128:
@@ -186,6 +200,7 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: v_mov_b32_e32 v2, 0
; GFX6-NEXT: v_mov_b32_e32 v3, 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr
%ext = zext i32 %load to i128
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomic_load_local.ll
@@ -6,6 +6,7 @@
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_b32 v0, v0{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i32 @atomic_load_monotonic_i32(i32 addrspace(3)* %ptr) {
%load = load atomic i32, i32 addrspace(3)* %ptr monotonic, align 4
@@ -17,6 +18,7 @@ define i32 @atomic_load_monotonic_i32(i32 addrspace(3)* %ptr) {
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i32 @atomic_load_monotonic_i32_offset(i32 addrspace(3)* %ptr) {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16
@@ -29,6 +31,7 @@ define i32 @atomic_load_monotonic_i32_offset(i32 addrspace(3)* %ptr) {
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i64 @atomic_load_monotonic_i64(i64 addrspace(3)* %ptr) {
%load = load atomic i64, i64 addrspace(3)* %ptr monotonic, align 8
@@ -40,6 +43,7 @@ define i64 @atomic_load_monotonic_i64(i64 addrspace(3)* %ptr) {
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i64 @atomic_load_monotonic_i64_offset(i64 addrspace(3)* %ptr) {
%gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i32 16
@@ -52,6 +56,7 @@ define i64 @atomic_load_monotonic_i64_offset(i64 addrspace(3)* %ptr) {
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define float @atomic_load_monotonic_f32_offset(float addrspace(3)* %ptr) {
%gep = getelementptr inbounds float, float addrspace(3)* %ptr, i32 16
@@ -64,6 +69,7 @@ define float @atomic_load_monotonic_f32_offset(float addrspace(3)* %ptr) {
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define double @atomic_load_monotonic_f64_offset(double addrspace(3)* %ptr) {
%gep = getelementptr inbounds double, double addrspace(3)* %ptr, i32 16
@@ -76,6 +82,7 @@ define double @atomic_load_monotonic_f64_offset(double addrspace(3)* %ptr) {
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i8* @atomic_load_monotonic_p0i8_offset(i8* addrspace(3)* %ptr) {
%gep = getelementptr inbounds i8*, i8* addrspace(3)* %ptr, i32 16
@@ -88,6 +95,7 @@ define i8* @atomic_load_monotonic_p0i8_offset(i8* addrspace(3)* %ptr) {
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define i8 addrspace(3)* @atomic_load_monotonic_p3i8_offset(i8 addrspace(3)* addrspace(3)* %ptr) {
%gep = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %ptr, i32 16
5 changes: 5 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomic_store_local.ll
@@ -6,6 +6,7 @@
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_write_b32 v0, v1{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_i32(i32 addrspace(3)* %ptr, i32 %val) {
store atomic i32 %val, i32 addrspace(3)* %ptr monotonic, align 4
@@ -17,6 +18,7 @@ define void @atomic_store_monotonic_i32(i32 addrspace(3)* %ptr, i32 %val) {
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_write_b32 v0, v1 offset:64{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_i32(i32 addrspace(3)* %ptr, i32 %val) {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16
@@ -29,6 +31,7 @@ define void @atomic_store_monotonic_offset_i32(i32 addrspace(3)* %ptr, i32 %val)
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_write_b64 v0, v[1:2]{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_i64(i64 addrspace(3)* %ptr, i64 %val) {
store atomic i64 %val, i64 addrspace(3)* %ptr monotonic, align 8
@@ -40,9 +43,11 @@ define void @atomic_store_monotonic_i64(i64 addrspace(3)* %ptr, i64 %val) {
; GFX9-NOT: s_mov_b32 m0
; CI-NEXT: s_mov_b32 m0
; GCN-NEXT: ds_write_b64 v0, v[1:2] offset:128{{$}}
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @atomic_store_monotonic_offset_i64(i64 addrspace(3)* %ptr, i64 %val) {
%gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i64 16
store atomic i64 %val, i64 addrspace(3)* %gep monotonic, align 8
ret void
}

2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -630,7 +630,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
; GCN: v_mov_b32_e32 v0, 42
; GCN: s_swappc_b64 s[30:31],
; GCN: s_waitcnt
; GCN-NOT: s_waitcnt
; GCN: buffer_store_dword v0, off, s[36:39], 0
define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
%val = call i32 @external_i32_func_i32(i32 42)
6 changes: 0 additions & 6 deletions llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -10,7 +10,6 @@ declare hidden void @external_void_func_void() #0
; GCN-NEXT: s_addc_u32 s35, s35,
; GCN-NEXT: s_mov_b32 s32, 0
; GCN: s_swappc_b64 s[30:31], s[34:35]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

; GCN-NEXT: #ASMSTART
; GCN-NEXT: #ASMEND
@@ -31,7 +30,6 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
; GCN: v_writelane_b32 v40, s31, 3

; GCN: s_swappc_b64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
@@ -57,7 +55,6 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; GCN: s_mov_b32 s33, s32
; GCN: s_add_u32 s32, s32, 0x400
; GCN: s_swappc_b64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_swappc_b64

; GCN: v_readlane_b32 s33, v40, 4
@@ -109,7 +106,6 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31:
; GCN: s_mov_b32 s33, s31
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s31, s33
define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 {
%s31 = call i32 asm sideeffect "; def $0", "={s31}"()
@@ -121,7 +117,6 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
; GCN: v_mov_b32_e32 v40, v31
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v31, v40
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
@@ -168,7 +163,6 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(

; GCN-NOT: s34
; GCN: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

; GCN-NOT: s34

2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -66,7 +66,6 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s34
; GCN-NEXT: v_mov_b32_e32 v1, s35
; GCN-NEXT: global_store_dword v[0:1], v40, off
@@ -90,7 +89,6 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)*
; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s34
; GCN-NEXT: v_mov_b32_e32 v2, s35
; GCN-NEXT: global_store_dword v[1:2], v0, off
17 changes: 15 additions & 2 deletions llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -33,6 +33,7 @@ define void @callee_no_stack_no_fp_elim_nonleaf() #2 {
; GCN-NEXT: s_waitcnt
; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack() #0 {
%alloca = alloca i32, addrspace(5)
@@ -52,6 +53,7 @@ define void @callee_with_stack() #0 {
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}}
; GCN-NEXT: s_sub_u32 s32, s32, 0x200
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_all() #1 {
%alloca = alloca i32, addrspace(5)
@@ -64,6 +66,7 @@ define void @callee_with_stack_no_fp_elim_all() #1 {
; GCN-NEXT: s_waitcnt
; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
%alloca = alloca i32, addrspace(5)
@@ -96,6 +99,8 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)

; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_and_call() #0 {
%alloca = alloca i32, addrspace(5)
@@ -130,6 +135,7 @@ define void @callee_with_stack_and_call() #0 {
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_with_call() #0 {
call void @external_void_func_void()
@@ -155,6 +161,7 @@ declare hidden void @external_void_func_void() #0
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
@@ -212,6 +219,7 @@ define void @spill_only_csr_sgpr() {
; GCN: s_add_u32 s32, s32, 0x300
; GCN-NEXT: s_sub_u32 s32, s32, 0x300
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
%alloca = alloca i32, addrspace(5)
@@ -234,6 +242,7 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
; GCN: s_add_u32 s32, s32, 0x300
; GCN-NEXT: s_sub_u32 s32, s32, 0x300
; GCN-NEXT: v_readlane_b32 s33, v1, 63
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @last_lane_vgpr_for_fp_csr() #1 {
%alloca = alloca i32, addrspace(5)
@@ -267,6 +276,7 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
; GCN: s_add_u32 s32, s32, 0x300
; GCN-NEXT: s_sub_u32 s32, s32, 0x300
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @no_new_vgpr_for_fp_csr() #1 {
%alloca = alloca i32, addrspace(5)
@@ -294,6 +304,7 @@ define void @no_new_vgpr_for_fp_csr() #1 {
; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33
; GCN-NEXT: s_sub_u32 s32, s32, 0x100000
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @realign_stack_no_fp_elim() #1 {
%alloca = alloca i32, align 8192, addrspace(5)
@@ -315,6 +326,7 @@ define void @realign_stack_no_fp_elim() #1 {
; GCN-NEXT: v_readlane_b32 s5, v1, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x200
; GCN-NEXT: v_readlane_b32 s33, v1, 2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
define void @no_unused_non_csr_sgpr_for_fp() #1 {
%alloca = alloca i32, addrspace(5)
@@ -353,6 +365,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
%alloca = alloca i32, addrspace(5)
@@ -399,6 +412,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #1 {
%alloca = alloca i32, addrspace(5)
@@ -422,8 +436,7 @@ define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval ali
}

; GCN-LABEL: {{^}}local_empty_func:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
define internal void @local_empty_func() #0 {
ret void
16 changes: 13 additions & 3 deletions llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -6,6 +6,7 @@
; VARABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_x() #1 {
%val = call i32 @llvm.amdgcn.workitem.id.x()
@@ -18,6 +19,7 @@ define void @use_workitem_id_x() #1 {
; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_y() #1 {
%val = call i32 @llvm.amdgcn.workitem.id.y()
@@ -30,6 +32,7 @@ define void @use_workitem_id_y() #1 {
; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_z() #1 {
%val = call i32 @llvm.amdgcn.workitem.id.z()
@@ -47,6 +50,7 @@ define void @use_workitem_id_z() #1 {

; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_xy() #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
@@ -71,6 +75,7 @@ define void @use_workitem_id_xy() #1 {
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_xyz() #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
@@ -92,6 +97,7 @@ define void @use_workitem_id_xyz() #1 {

; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_xz() #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
@@ -111,6 +117,7 @@ define void @use_workitem_id_xz() #1 {

; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_yz() #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.y()
@@ -393,7 +400,8 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
; VARABI: v_and_b32_e32 v32, 0x3ff, v32
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32

; VARABI: s_setpc_b64
; VARABI: s_waitcnt
; VARABI-NEXT: s_setpc_b64

; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
@@ -700,7 +708,8 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]

; VARABI: s_setpc_b64
; VARABI: s_waitcnt
; VARABI-NEXT: s_setpc_b64


; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
@@ -804,7 +813,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]]

; GCN: s_setpc_b64
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN: ScratchSize: 0
define void @too_many_args_use_workitem_id_x_stack_yz(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -8,6 +8,7 @@ define <2 x half> @chain_hi_to_lo_private() {
; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
@@ -28,6 +29,7 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %ba
; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, half addrspace(5)* %base_lo
@@ -66,6 +68,7 @@ define <2 x half> @chain_hi_to_lo_group() {
; GCN-NEXT: ds_read_u16 v0, v1 offset:2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_u16_d16_hi v0, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
@@ -86,6 +89,7 @@ define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base
; GCN-NEXT: ds_read_u16 v0, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_u16_d16_hi v0, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, half addrspace(3)* %base_lo
Expand All @@ -108,6 +112,7 @@ define <2 x half> @chain_hi_to_lo_global() {
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
@@ -128,6 +133,7 @@ define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %bas
; GCN-NEXT: global_load_ushort v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, half addrspace(1)* %base_lo
@@ -150,6 +156,7 @@ define <2 x half> @chain_hi_to_lo_flat() {
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, half* null, i64 1
@@ -170,6 +177,7 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %ba
; GCN-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, half* %base_lo
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -39,14 +39,14 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
bb0:
%split.ret.type = call <2 x float> @func_v2f32()
@@ -73,14 +73,14 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
bb0:
%split.ret.type = call <3 x float> @func_v3f32()
@@ -107,14 +107,14 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
bb0:
%split.ret.type = call <4 x half> @func_v4f16()
@@ -141,7 +141,6 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+12
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: v_mov_b32_e32 v1, v4
Expand All @@ -150,6 +149,7 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
bb0:
%split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
@@ -184,7 +184,6 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_branch BB4_3
; GCN-NEXT: BB4_2:
; GCN-NEXT: s_mov_b32 s4, 0
@@ -230,7 +229,6 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_branch BB5_3
; GCN-NEXT: BB5_2:
; GCN-NEXT: s_mov_b32 s4, 0
30 changes: 21 additions & 9 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -53,15 +53,27 @@ define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
}

define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GCN-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
%lshr.8 = lshr i32 %arg0, 8
store i32 %lshr.8, i32 addrspace(1)* undef
%masked = and i32 %lshr.8, 255
3 changes: 3 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -128,6 +128,7 @@ define i32 @global_load_2xi16_align1(i16 addrspace(1)* %p) #0 {
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1]
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_load_2xi16_align1:
@@ -217,12 +218,14 @@ define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: flat_load_dword v0, v[0:1]
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: global_load_2xi16_align4:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1]
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_load_2xi16_align4:
10 changes: 10 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -55,6 +55,7 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align2:
@@ -65,6 +66,7 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
; GFX7-UNALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align2:
@@ -74,6 +76,7 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX9-NEXT: v_mov_b32_e32 v0, 2
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
store i16 1, i16 addrspace(5)* %r, align 2
@@ -108,6 +111,7 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align1:
@@ -145,20 +149,23 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align1:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
store i16 1, i16 addrspace(5)* %r, align 1
@@ -179,12 +186,14 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align4:
@@ -224,6 +233,7 @@ define void @private_store_2xi16_align4(i16 addrspace(5)* %p, i16 addrspace(5)*
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0x20001
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
store i16 1, i16 addrspace(5)* %r, align 4
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -763,6 +763,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
%arg0.0 = extractelement <3 x float> %arg0, i32 0
@@ -781,6 +782,7 @@ define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
%arg0.0 = extractelement <3 x i32> %arg0, i32 0
43 changes: 43 additions & 0 deletions llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -4,6 +4,7 @@

; GCN-LABEL: {{^}}i1_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define i1 @i1_func_void() #0 {
%val = load i1, i1 addrspace(1)* undef
@@ -13,6 +14,7 @@ define i1 @i1_func_void() #0 {
; FIXME: Missing and?
; GCN-LABEL: {{^}}i1_zeroext_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i1 @i1_zeroext_func_void() #0 {
%val = load i1, i1 addrspace(1)* undef
@@ -31,6 +33,7 @@ define signext i1 @i1_signext_func_void() #0 {

; GCN-LABEL: {{^}}i8_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i8 @i8_func_void() #0 {
%val = load i8, i8 addrspace(1)* undef
@@ -39,6 +42,7 @@ define i8 @i8_func_void() #0 {

; GCN-LABEL: {{^}}i8_zeroext_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i8 @i8_zeroext_func_void() #0 {
%val = load i8, i8 addrspace(1)* undef
@@ -47,6 +51,7 @@ define zeroext i8 @i8_zeroext_func_void() #0 {

; GCN-LABEL: {{^}}i8_signext_func_void:
; GCN: buffer_load_sbyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define signext i8 @i8_signext_func_void() #0 {
%val = load i8, i8 addrspace(1)* undef
@@ -55,6 +60,7 @@ define signext i8 @i8_signext_func_void() #0 {

; GCN-LABEL: {{^}}i16_func_void:
; GCN: buffer_load_ushort v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i16 @i16_func_void() #0 {
%val = load i16, i16 addrspace(1)* undef
@@ -63,6 +69,7 @@ define i16 @i16_func_void() #0 {

; GCN-LABEL: {{^}}i16_zeroext_func_void:
; GCN: buffer_load_ushort v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i16 @i16_zeroext_func_void() #0 {
%val = load i16, i16 addrspace(1)* undef
@@ -71,6 +78,7 @@ define zeroext i16 @i16_zeroext_func_void() #0 {

; GCN-LABEL: {{^}}i16_signext_func_void:
; GCN: buffer_load_sshort v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define signext i16 @i16_signext_func_void() #0 {
%val = load i16, i16 addrspace(1)* undef
@@ -79,6 +87,7 @@ define signext i16 @i16_signext_func_void() #0 {

; GCN-LABEL: {{^}}i32_func_void:
; GCN: buffer_load_dword v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i32 @i32_func_void() #0 {
%val = load i32, i32 addrspace(1)* undef
@@ -88,6 +97,7 @@ define i32 @i32_func_void() #0 {
; GCN-LABEL: {{^}}i48_func_void:
; GCN: buffer_load_dword v0, off
; GCN-NEXT: buffer_load_ushort v1, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i48 @i48_func_void() #0 {
%val = load i48, i48 addrspace(1)* undef, align 8
@@ -97,6 +107,7 @@ define i48 @i48_func_void() #0 {
; GCN-LABEL: {{^}}i48_zeroext_func_void:
; GCN: buffer_load_dword v0, off
; GCN-NEXT: buffer_load_ushort v1, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i48 @i48_zeroext_func_void() #0 {
%val = load i48, i48 addrspace(1)* undef, align 8
@@ -106,13 +117,16 @@ define zeroext i48 @i48_zeroext_func_void() #0 {
; GCN-LABEL: {{^}}i48_signext_func_void:
; GCN: buffer_load_dword v0, off
; GCN-NEXT: buffer_load_sshort v1, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define signext i48 @i48_signext_func_void() #0 {
%val = load i48, i48 addrspace(1)* undef, align 8
ret i48 %val
}

; GCN-LABEL: {{^}}i63_func_void:
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
define i63 @i63_func_void(i63 %val) #0 {
ret i63 %val
}
@@ -140,6 +154,7 @@ define signext i63 @i63_signext_func_void(i63 %val) #0 {

; GCN-LABEL: {{^}}i64_func_void:
; GCN: buffer_load_dwordx2 v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i64 @i64_func_void() #0 {
%val = load i64, i64 addrspace(1)* undef
@@ -149,6 +164,7 @@ define i64 @i64_func_void() #0 {
; GCN-LABEL: {{^}}i65_func_void:
; GCN-DAG: buffer_load_dwordx2 v[0:1], off
; GCN-DAG: buffer_load_ubyte v2, off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i65 @i65_func_void() #0 {
%val = load i65, i65 addrspace(1)* undef
@@ -157,6 +173,7 @@ define i65 @i65_func_void() #0 {

; GCN-LABEL: {{^}}f32_func_void:
; GCN: buffer_load_dword v0, off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define float @f32_func_void() #0 {
%val = load float, float addrspace(1)* undef
@@ -165,6 +182,7 @@ define float @f32_func_void() #0 {

; GCN-LABEL: {{^}}f64_func_void:
; GCN: buffer_load_dwordx2 v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define double @f64_func_void() #0 {
%val = load double, double addrspace(1)* undef
@@ -173,6 +191,7 @@ define double @f64_func_void() #0 {

; GCN-LABEL: {{^}}v2f64_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <2 x double> @v2f64_func_void() #0 {
%val = load <2 x double>, <2 x double> addrspace(1)* undef
@@ -181,6 +200,7 @@ define <2 x double> @v2f64_func_void() #0 {

; GCN-LABEL: {{^}}v2i32_func_void:
; GCN: buffer_load_dwordx2 v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <2 x i32> @v2i32_func_void() #0 {
%val = load <2 x i32>, <2 x i32> addrspace(1)* undef
@@ -189,6 +209,7 @@ define <2 x i32> @v2i32_func_void() #0 {

; GCN-LABEL: {{^}}v3i32_func_void:
; GCN: buffer_load_dwordx3 v[0:2], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <3 x i32> @v3i32_func_void() #0 {
%val = load <3 x i32>, <3 x i32> addrspace(1)* undef
@@ -197,6 +218,7 @@ define <3 x i32> @v3i32_func_void() #0 {

; GCN-LABEL: {{^}}v4i32_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <4 x i32> @v4i32_func_void() #0 {
%val = load <4 x i32>, <4 x i32> addrspace(1)* undef
@@ -206,6 +228,7 @@ define <4 x i32> @v4i32_func_void() #0 {
; GCN-LABEL: {{^}}v5i32_func_void:
; GCN-DAG: buffer_load_dword v4, off
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <5 x i32> @v5i32_func_void() #0 {
%val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef
@@ -215,6 +238,7 @@ define <5 x i32> @v5i32_func_void() #0 {
; GCN-LABEL: {{^}}v8i32_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <8 x i32> @v8i32_func_void() #0 {
%ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
@@ -227,6 +251,7 @@ define <8 x i32> @v8i32_func_void() #0 {
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <16 x i32> @v16i32_func_void() #0 {
%ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
@@ -243,6 +268,7 @@ define <16 x i32> @v16i32_func_void() #0 {
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <32 x i32> @v32i32_func_void() #0 {
%ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
@@ -252,6 +278,7 @@ define <32 x i32> @v32i32_func_void() #0 {

; GCN-LABEL: {{^}}v2i64_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <2 x i64> @v2i64_func_void() #0 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* undef
@@ -261,6 +288,7 @@ define <2 x i64> @v2i64_func_void() #0 {
; GCN-LABEL: {{^}}v3i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <3 x i64> @v3i64_func_void() #0 {
%ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(4)* undef
@@ -271,6 +299,7 @@ define <3 x i64> @v3i64_func_void() #0 {
; GCN-LABEL: {{^}}v4i64_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN: buffer_load_dwordx4 v[4:7], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <4 x i64> @v4i64_func_void() #0 {
%ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(4)* undef
@@ -282,6 +311,7 @@ define <4 x i64> @v4i64_func_void() #0 {
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <5 x i64> @v5i64_func_void() #0 {
%ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(4)* undef
@@ -294,6 +324,7 @@ define <5 x i64> @v5i64_func_void() #0 {
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <8 x i64> @v8i64_func_void() #0 {
%ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(4)* undef
@@ -310,6 +341,7 @@ define <8 x i64> @v8i64_func_void() #0 {
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <16 x i64> @v16i64_func_void() #0 {
%ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(4)* undef
@@ -319,6 +351,7 @@ define <16 x i64> @v16i64_func_void() #0 {

; GCN-LABEL: {{^}}v2i16_func_void:
; GFX9: buffer_load_dword v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <2 x i16> @v2i16_func_void() #0 {
%val = load <2 x i16>, <2 x i16> addrspace(1)* undef
@@ -327,6 +360,7 @@ define <2 x i16> @v2i16_func_void() #0 {

; GCN-LABEL: {{^}}v3i16_func_void:
; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <3 x i16> @v3i16_func_void() #0 {
%val = load <3 x i16>, <3 x i16> addrspace(1)* undef
@@ -335,6 +369,7 @@ define <3 x i16> @v3i16_func_void() #0 {

; GCN-LABEL: {{^}}v4i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <4 x i16> @v4i16_func_void() #0 {
%val = load <4 x i16>, <4 x i16> addrspace(1)* undef
@@ -343,6 +378,7 @@ define <4 x i16> @v4i16_func_void() #0 {

; GCN-LABEL: {{^}}v4f16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <4 x half> @v4f16_func_void() #0 {
%val = load <4 x half>, <4 x half> addrspace(1)* undef
@@ -354,6 +390,7 @@ define <4 x half> @v4f16_func_void() #0 {
; GCN-LABEL: {{^}}v5i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1]
; GFX9-NEXT: global_load_short_d16 v2
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64
define <5 x i16> @v5i16_func_void() #0 {
%ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
@@ -363,6 +400,7 @@ define <5 x i16> @v5i16_func_void() #0 {

; GCN-LABEL: {{^}}v8i16_func_void:
; GFX9-DAG: buffer_load_dwordx4 v[0:3], off
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <8 x i16> @v8i16_func_void() #0 {
%ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(4)* undef
@@ -373,6 +411,7 @@ define <8 x i16> @v8i16_func_void() #0 {
; GCN-LABEL: {{^}}v16i16_func_void:
; GFX9: buffer_load_dwordx4 v[0:3], off
; GFX9: buffer_load_dwordx4 v[4:7], off
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <16 x i16> @v16i16_func_void() #0 {
%ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(4)* undef
@@ -408,6 +447,7 @@ define <4 x i8> @v4i8_func_void() #0 {
; GCN-LABEL: {{^}}struct_i8_i32_func_void:
; GCN-DAG: buffer_load_dword v1
; GCN-DAG: buffer_load_ubyte v0
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define {i8, i32} @struct_i8_i32_func_void() #0 {
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef
@@ -467,6 +507,7 @@ define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret %arg0)
; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}}
; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}}
; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}}
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <33 x i32> @v33i32_func_void() #0 {
%ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(4)* undef
@@ -508,6 +549,7 @@ define <33 x i32> @v33i32_func_void() #0 {
; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}}
; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}}
; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}}
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
%ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(4)* undef
@@ -549,6 +591,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:244{{$}}
; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:248{{$}}
; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:252{{$}}
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
%ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(4)* undef
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
@@ -10,6 +10,7 @@ define amdgpu_ps float @global_csub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase,
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] glc
; GCN-NEXT: ; implicit-def: $vcc_hi
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
@@ -24,6 +25,7 @@ define amdgpu_ps float @global_csub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc
; GCN-NEXT: ; implicit-def: $vcc_hi
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset