150 changes: 80 additions & 70 deletions llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-LABEL: non_preserved_vgpr_tuple8:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v45, s33, 0
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v36, v16
; GFX9-NEXT: v_mov_b32_e32 v35, v15
; GFX9-NEXT: v_mov_b32_e32 v34, v14
Expand All @@ -36,6 +36,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v45, s4, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
Expand All @@ -54,31 +55,32 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v45, 0
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_addk_i32 s32, 0xf800
; GFX9-NEXT: v_readlane_b32 s33, v45, 0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: non_preserved_vgpr_tuple8:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: s_mov_b32 s4, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_or_saveexec_b32 s5, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_mov_b32 exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v36, v16
; GFX10-NEXT: v_mov_b32_e32 v35, v15
; GFX10-NEXT: v_mov_b32_e32 v34, v14
; GFX10-NEXT: v_mov_b32_e32 v33, v13
; GFX10-NEXT: v_mov_b32_e32 v32, v12
; GFX10-NEXT: v_writelane_b32 v45, s33, 0
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
Expand All @@ -93,7 +95,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: v_writelane_b32 v45, s4, 0
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
Expand All @@ -113,31 +115,32 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfc00
; GFX10-NEXT: v_readlane_b32 s33, v45, 0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: v_readlane_b32 s4, v45, 0
; GFX10-NEXT: s_or_saveexec_b32 s5, -1
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16
; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16
; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_mov_b32 exec_lo, s5
; GFX10-NEXT: s_addk_i32 s32, 0xfc00
; GFX10-NEXT: s_mov_b32 s33, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: non_preserved_vgpr_tuple8:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: s_mov_b32 s0, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16
; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16
; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
; GFX11-NEXT: v_mov_b32_e32 v32, v12
; GFX11-NEXT: v_writelane_b32 v45, s33, 0
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8
Expand All @@ -153,6 +156,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: v_writelane_b32 v45, s0, 0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
Expand All @@ -170,13 +174,14 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_addk_i32 s32, 0xffe0
; GFX11-NEXT: v_readlane_b32 s33, v45, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: v_readlane_b32 s0, v45, 0
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16
; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: s_addk_i32 s32, 0xffe0
; GFX11-NEXT: s_mov_b32 s33, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]

Expand Down Expand Up @@ -207,12 +212,12 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-LABEL: call_preserved_vgpr_tuple8:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v46, s33, 0
; GFX9-NEXT: s_mov_b32 s4, s33
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
Expand All @@ -225,6 +230,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: v_mov_b32_e32 v41, v12
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v46, s4, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
Expand All @@ -244,34 +250,35 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v46, 0
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_addk_i32 s32, 0xf800
; GFX9-NEXT: v_readlane_b32 s33, v46, 0
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: s_mov_b32 s33, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: call_preserved_vgpr_tuple8:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: v_writelane_b32 v46, s33, 0
; GFX10-NEXT: s_mov_b32 s4, s33
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_or_saveexec_b32 s5, -1
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s5
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: v_writelane_b32 v46, s4, 0
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
Expand All @@ -296,28 +303,29 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: s_addk_i32 s32, 0xfc00
; GFX10-NEXT: v_readlane_b32 s33, v46, 0
; GFX10-NEXT: s_or_saveexec_b32 s4, -1
; GFX10-NEXT: v_readlane_b32 s4, v46, 0
; GFX10-NEXT: s_or_saveexec_b32 s5, -1
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20
; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20
; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_mov_b32 exec_lo, s5
; GFX10-NEXT: s_addk_i32 s32, 0xfc00
; GFX10-NEXT: s_mov_b32 s33, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: call_preserved_vgpr_tuple8:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20
; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:24
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v46, s33, 0
; GFX11-NEXT: s_mov_b32 s0, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20
; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:24
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: s_clause 0x4
; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16
; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:12
Expand All @@ -326,6 +334,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: scratch_store_b32 off, v45, s33
; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_add_i32 s32, s32, 32
; GFX11-NEXT: v_writelane_b32 v46, s0, 0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
Expand All @@ -348,13 +357,14 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: s_addk_i32 s32, 0xffe0
; GFX11-NEXT: v_readlane_b32 s33, v46, 0
; GFX11-NEXT: s_or_saveexec_b32 s0, -1
; GFX11-NEXT: v_readlane_b32 s0, v46, 0
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:24
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20
; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:24
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: s_addk_i32 s32, 0xffe0
; GFX11-NEXT: s_mov_b32 s33, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]

Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/wave32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1108,39 +1108,39 @@ declare void @external_void_func_void() #1
; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
; GCN-NEXT: s_waitcnt_vscnt

; GCN-NEXT: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]+]], -1{{$}}
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]

; GCN-NEXT: v_writelane_b32 v41, s33, 0
; GCN: s_mov_b32 s33, s32
; GFX1064: s_addk_i32 s32, 0x400
; GFX1032: s_addk_i32 s32, 0x200

; GCN-NEXT: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0

; GCN-DAG: v_writelane_b32 v40, s30, 0

; GCN-DAG: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
; GCN-DAG: v_readlane_b32 s30, v40, 0
; GCN-DAG: v_readlane_b32 s31, v40, 1


; GFX1064: s_addk_i32 s32, 0xfc00
; GFX1032: s_addk_i32 s32, 0xfe00
; GCN: v_readlane_b32 s33, v41, 0
; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0
; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]+]], -1{{$}}
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4
; GCN-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]
; GFX1064: s_addk_i32 s32, 0xfc00
; GFX1032: s_addk_i32 s32, 0xfe00
; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_with_call() #1 {
Expand Down
128 changes: 64 additions & 64 deletions llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
Original file line number Diff line number Diff line change
Expand Up @@ -332,13 +332,13 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
; GFX9-O0-LABEL: strict_wwm_call:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_mov_b32 s35, s33
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
Expand Down Expand Up @@ -370,27 +370,27 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4
; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00
; GFX9-O0-NEXT: s_mov_b32 s33, s35
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-O3-LABEL: strict_wwm_call:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O3-NEXT: s_mov_b32 s38, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O3-NEXT: s_mov_b32 s38, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_addk_i32 s32, 0x400
; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1
; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
Expand All @@ -410,13 +410,13 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg)
; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1
; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-O3-NEXT: s_mov_b32 s33, s38
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00
; GFX9-O3-NEXT: s_mov_b32 s33, s38
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
Expand Down Expand Up @@ -516,24 +516,24 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O0-LABEL: strict_wwm_call_i64:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_mov_b32 s42, s33
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_mov_b32 s42, s33
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00
; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1
Expand Down Expand Up @@ -598,50 +598,50 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1
; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400
; GFX9-O0-NEXT: s_mov_b32 s33, s42
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400
; GFX9-O0-NEXT: s_mov_b32 s33, s42
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-O3-LABEL: strict_wwm_call_i64:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O3-NEXT: s_mov_b32 s40, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0
; GFX9-O3-NEXT: s_mov_b32 s40, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
Expand Down Expand Up @@ -671,22 +671,22 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a
; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1
; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800
; GFX9-O3-NEXT: s_mov_b32 s33, s40
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_nop 0
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800
; GFX9-O3-NEXT: s_mov_b32 s33, s40
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
%tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
Expand Down