144 changes: 72 additions & 72 deletions llvm/test/CodeGen/AMDGPU/load-global-i16.ll

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/load-global-i32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3031,11 +3031,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29
; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30
; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31
; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7
; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6
; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
Expand Down Expand Up @@ -3090,10 +3090,10 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192
; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
Expand Down Expand Up @@ -3611,11 +3611,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[8:11], 0 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:16 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[8:11], 0 offset:8 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[8:11], 0 offset:12 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10
Expand Down Expand Up @@ -3654,11 +3654,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:16 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/memory_clause.ll
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-NEXT: s_add_u32 s16, s16, s3
; GCN-NEXT: s_addc_u32 s17, s17, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000
; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4
; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_brev_b32 s0, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -401,7 +401,7 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0
; GCN-NEXT: s_nop 0
; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1
; GCN-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -421,11 +421,11 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1
; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4
; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: ;;#ASMSTART
; GCN-SCRATCH-NEXT: ;;#ASMEND
; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off
; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11
Expand Down Expand Up @@ -460,15 +460,15 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32
; GCN-NEXT: s_add_u32 s4, s4, s3
; GCN-NEXT: s_addc_u32 s5, s5, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0x40d00000
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:8
; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: exp mrt0 v0, off, off, off done vm
Expand All @@ -482,15 +482,15 @@ define amdgpu_kernel void @flat_scratch_load_clause(float %a, float %b, <8 x i32
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40d00000
; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4
; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: scratch_store_dword off, v1, off offset:8
; GCN-SCRATCH-NEXT: scratch_store_dword off, v1, off offset:4
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: ;;#ASMSTART
; GCN-SCRATCH-NEXT: ;;#ASMEND
; GCN-SCRATCH-NEXT: s_clause 0x1
; GCN-SCRATCH-NEXT: scratch_load_dword v0, off, off offset:4
; GCN-SCRATCH-NEXT: scratch_load_dword v1, off, off offset:8
; GCN-SCRATCH-NEXT: scratch_load_dword v0, off, off
; GCN-SCRATCH-NEXT: scratch_load_dword v1, off, off offset:4
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-SCRATCH-NEXT: exp mrt0 v0, off, off, off done vm
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,11 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
; so a possibly negative base index can't be used for the vgpr offset.

; GCN-LABEL: {{^}}store_private_unknown_bits_vaddr:
; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 4
; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 0
; SICIVI: v_add_{{i|u}}32_e32 [[ADDR1:v[0-9]+]], vcc, 32, [[ADDR0]]
; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}

; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 4,
; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 0,
; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32
define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 {
%alloca = alloca [16 x i32], align 4, addrspace(5)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
Expand All @@ -48,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
Expand Down Expand Up @@ -86,15 +86,15 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
Expand Down
62 changes: 31 additions & 31 deletions llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: v_writelane_b32 v2, s10, 62
; GCN-NEXT: v_writelane_b32 v2, s11, 63
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[4:11]
Expand Down Expand Up @@ -201,7 +201,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: v_writelane_b32 v1, s10, 62
; GCN-NEXT: v_writelane_b32 v1, s11, 63
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[4:11]
Expand All @@ -215,18 +215,18 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: v_writelane_b32 v0, s10, 6
; GCN-NEXT: v_writelane_b32 v0, s11, 7
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, s1
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_readlane_b32 s8, v2, 56
Expand Down Expand Up @@ -319,7 +319,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: v_readlane_b32 s6, v1, 6
; GCN-NEXT: v_readlane_b32 s7, v1, 7
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[0:7]
Expand Down Expand Up @@ -423,13 +423,13 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB0_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ; kill: killed $vgpr2
; GCN-NEXT: ; kill: killed $vgpr1
Expand Down Expand Up @@ -570,7 +570,7 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_writelane_b32 v1, s18, 62
; GCN-NEXT: v_writelane_b32 v1, s19, 63
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[4:11]
Expand All @@ -589,18 +589,18 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_writelane_b32 v0, s2, 8
; GCN-NEXT: v_writelane_b32 v0, s3, 9
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, s1
; GCN-NEXT: s_cbranch_scc1 .LBB1_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_readlane_b32 s16, v1, 8
Expand Down Expand Up @@ -698,10 +698,10 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB1_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[28:29]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
Expand Down Expand Up @@ -747,10 +747,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -840,7 +840,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_writelane_b32 v1, s18, 62
; GCN-NEXT: v_writelane_b32 v1, s19, 63
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[2:3]
Expand All @@ -849,15 +849,15 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_writelane_b32 v0, s2, 0
; GCN-NEXT: v_writelane_b32 v0, s3, 1
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, s1
; GCN-NEXT: s_cbranch_scc1 .LBB2_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s36, v1, 32
Expand Down Expand Up @@ -909,7 +909,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: v_readlane_b32 s30, v1, 14
; GCN-NEXT: v_readlane_b32 s31, v1, 15
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[16:31]
Expand Down Expand Up @@ -947,10 +947,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB2_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
Expand Down Expand Up @@ -999,10 +999,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -1092,7 +1092,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: v_writelane_b32 v1, s18, 62
; GCN-NEXT: v_writelane_b32 v1, s19, 63
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[2:3]
Expand All @@ -1101,15 +1101,15 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: v_writelane_b32 v0, s2, 0
; GCN-NEXT: v_writelane_b32 v0, s3, 1
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, s1
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s36, v2, 32
Expand Down Expand Up @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: v_readlane_b32 s30, v2, 14
; GCN-NEXT: v_readlane_b32 s31, v2, 15
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def v0
Expand Down Expand Up @@ -1205,10 +1205,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB3_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[34:35]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ body: |
; GCN-LABEL: name: preserve_active_lanes_above_args
; GCN: liveins: $sgpr0, $vgpr8, $vgpr9, $vgpr10
; GCN-NEXT: {{ $}}
; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr10, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_ST 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec
$vgpr8 = COPY renamable killed $vgpr10
Expand Down Expand Up @@ -70,8 +70,8 @@ body: |
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
Expand All @@ -81,8 +81,8 @@ body: |
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
Expand Down Expand Up @@ -142,8 +142,8 @@ body: |
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
Expand All @@ -152,8 +152,8 @@ body: |
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,14 @@ body: |
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
Expand All @@ -72,8 +72,8 @@ body: |
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
Expand All @@ -86,8 +86,8 @@ body: |
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
Expand Down
136 changes: 68 additions & 68 deletions llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/scratch-simple.ll
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,14 @@
; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
; GFX11-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0

; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]]
; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x200, [[CLAMP_IDX]]
; MUBUF-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0}}, [[CLAMP_IDX]]
; FLATSCR: v_add{{_|_nc_}}{{u32|b32}}_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0}}, [[CLAMP_IDX]]

; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off offset:128
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off{{$}}
define amdgpu_ps float @ps_main(i32 %idx) {
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -106,7 +106,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: v_writelane_b32 v1, s22, 62
; GCN-NEXT: v_writelane_b32 v1, s23, 63
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[6:7]
Expand All @@ -115,15 +115,15 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: v_writelane_b32 v0, s6, 0
; GCN-NEXT: v_writelane_b32 v0, s7, 1
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s4, s5
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v1, 0
Expand All @@ -143,7 +143,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: v_readlane_b32 s18, v1, 14
; GCN-NEXT: v_readlane_b32 s19, v1, 15
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
Expand Down Expand Up @@ -213,10 +213,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB0_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
Expand Down
240 changes: 120 additions & 120 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill.mir

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
; Make sure we are handling hazards correctly.
; SGPR: v_mov_b32_e32 v0, vcc_lo
; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1
; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload
; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 ; 4-byte Folded Reload
; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]]
; SGPR-NEXT: s_waitcnt vmcnt(0)
; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0
Expand Down
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/spill-agpr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -75,31 +75,31 @@ use:
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1

; GFX908-DAG: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v5, a1 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v5, a2 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v5, a3 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill

; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill

; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]

; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off

; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:16 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
; GFX90A: global_store_dwordx4 v[0:1], v[2:5], off

; GCN: ScratchSize: 20
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/spill-m0.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
; TOVMEM: s_mov_b64 [[COPY_EXEC:s\[[0-9]+:[0-9]+\]]], exec
; TOVMEM: s_mov_b64 exec, 1
; TOVMEM: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0
; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Spill
; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Spill
; TOVMEM: s_mov_b64 exec, [[COPY_EXEC]]

; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]]
Expand All @@ -26,7 +26,7 @@
; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], [[M0_LANE]]
; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]]

; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; 4-byte Folded Reload
; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Reload
; TOVMEM: s_waitcnt vmcnt(0)
; TOVMEM: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]], 0
; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ define amdgpu_kernel void @test_inst_offset_kernel() {
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
Expand All @@ -27,16 +27,16 @@ define amdgpu_kernel void @test_inst_offset_kernel() {
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:8
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -277,19 +277,19 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Spill
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4084 ; 4-byte Folded Reload
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; v[0:1]
Expand All @@ -301,16 +301,16 @@ define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
; FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
; FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10315,8 +10315,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: s_mov_b32 s34, 0x84800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s34, 0x84800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -10351,8 +10351,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: s_mov_b32 s34, 0x85000
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s34, 0x85000
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -10387,8 +10387,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: s_mov_b32 s34, 0x85800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s34, 0x85800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -10431,8 +10431,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[0:1]
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 0xff
; GFX6-NEXT: s_mov_b32 s36, 0x86000
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s36, 0x86000
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -10449,8 +10449,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[34:35]
; GFX6-NEXT: s_mov_b64 s[34:35], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: s_mov_b32 s44, 0x86800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s44, 0x86800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s44 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -10463,8 +10463,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 exec, s[34:35]
; GFX6-NEXT: s_mov_b64 s[44:45], exec
; GFX6-NEXT: s_mov_b64 exec, 3
; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: v_mov_b32_e32 v7, 0x21b0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -10494,8 +10494,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_or_b64 exec, exec, vcc
; GFX6-NEXT: s_mov_b64 s[4:5], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: s_mov_b32 s6, 0x80400
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s6, 0x80400
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -10509,8 +10509,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX6-NEXT: s_mov_b64 s[36:37], s[0:1]
; GFX6-NEXT: s_mov_b64 s[4:5], exec
; GFX6-NEXT: s_mov_b64 exec, 15
; GFX6-NEXT: s_mov_b32 s6, 0x80800
; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0
; GFX6-NEXT: s_mov_b32 s6, 0x80800
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
; GFX6-NEXT: s_waitcnt vmcnt(0)
Expand Down
54 changes: 27 additions & 27 deletions llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir
Original file line number Diff line number Diff line change
Expand Up @@ -50,28 +50,28 @@ body: |
; GFX9-NEXT: $vcc = IMPLICIT_DEF
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX9-NEXT: $vcc = IMPLICIT_DEF
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX9-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX9-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX9-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GFX9-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc
; GFX9-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
;
; GFX10-LABEL: name: check_vcc
Expand All @@ -87,28 +87,28 @@ body: |
; GFX10-NEXT: $vcc = IMPLICIT_DEF
; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX10-NEXT: $vcc = IMPLICIT_DEF
; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX10-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX10-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX10-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GFX10-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
; GFX10-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc
; GFX10-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
; GFX10-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
;
; GFX11-LABEL: name: check_vcc
Expand All @@ -118,28 +118,28 @@ body: |
; GFX11-NEXT: $vcc = IMPLICIT_DEF
; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit $vcc
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX11-NEXT: $vcc = IMPLICIT_DEF
; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_lo, 0, undef $vgpr0, implicit $vcc
; GFX11-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $vcc_hi, 1, $vgpr0, implicit killed $vcc
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
; GFX11-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; GFX11-NEXT: $exec = S_MOV_B64 3, implicit-def $vgpr0
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %fixed-stack.0, align 16, addrspace 5)
; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GFX11-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GFX11-NEXT: $vcc_lo = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0, implicit-def $vcc
; GFX11-NEXT: $vcc_hi = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 1
; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %fixed-stack.0, align 16, addrspace 5)
; GFX11-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GFX11-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1, implicit killed $vgpr0
$vcc = IMPLICIT_DEF
SI_SPILL_S64_SAVE $vcc, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
Expand Down
28 changes: 26 additions & 2 deletions llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, 3
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, 9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; VI-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -56,6 +59,9 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -101,6 +107,8 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 128, addrspace(5)
store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
ret void
Expand All @@ -111,6 +119,9 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, 3
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, 9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; VI-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -119,7 +130,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel stackrealign_attr
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
; VI-NEXT: .amdhsa_private_segment_fixed_size 8
; VI-NEXT: .amdhsa_private_segment_fixed_size 12
; VI-NEXT: .amdhsa_kernarg_size 0
; VI-NEXT: .amdhsa_user_sgpr_count 6
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
Expand Down Expand Up @@ -159,6 +170,9 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -167,7 +181,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel stackrealign_attr
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
; GFX9-NEXT: .amdhsa_private_segment_fixed_size 8
; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12
; GFX9-NEXT: .amdhsa_kernarg_size 0
; GFX9-NEXT: .amdhsa_user_sgpr_count 6
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
Expand Down Expand Up @@ -204,6 +218,8 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
store volatile i32 9, ptr addrspace(5) %alloca.align, align 4
ret void
Expand All @@ -214,6 +230,9 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, 3
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, 9
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; VI-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -262,6 +281,9 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 3
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 9
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -307,6 +329,8 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
store volatile i32 9, ptr addrspace(5) %alloca.align, align 4
ret void
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@

declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1

; ERROR: error: <unknown>:0:0: stack frame size (131061) exceeds limit (131056) in function 'stack_size_limit_wave64'
; GCN: ; ScratchSize: 131061
; ERROR: error: <unknown>:0:0: stack frame size (131064) exceeds limit (131056) in function 'stack_size_limit_wave64'
; GCN: ; ScratchSize: 131064
; Kernel whose only stack object is a 131057-byte addrspace(5) array, i.e. one
; byte more than the 131056 limit quoted in the expected diagnostic above, so
; the reported stack frame size is expected to exceed that limit.
define amdgpu_kernel void @stack_size_limit_wave64() #0 {
entry:
; Byte array with align 1, so the object itself needs no alignment padding.
%alloca = alloca [131057 x i8], align 1, addrspace(5)
; Fill all 131057 bytes with the value 9 so the alloca is actually used.
; NOTE(review): the trailing `i32 1, i1 true` operands are presumably the
; legacy alignment and is-volatile arguments of this memset form -- confirm.
call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 9, i32 131057, i32 1, i1 true)
ret void
}

; ERROR: error: <unknown>:0:0: stack frame size (262117) exceeds limit (262112) in function 'stack_size_limit_wave32'
; GCN: ; ScratchSize: 262117
; ERROR: error: <unknown>:0:0: stack frame size (262120) exceeds limit (262112) in function 'stack_size_limit_wave32'
; GCN: ; ScratchSize: 262120
define amdgpu_kernel void @stack_size_limit_wave32() #1 {
entry:
%alloca = alloca [262113 x i8], align 1, addrspace(5)
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -878,7 +878,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-OPT-NEXT: s_lshr_b32 s6, s0, 5
; WAVE32-OPT-NEXT: s_mov_b64 s[0:1], s[8:9]
; WAVE32-OPT-NEXT: s_mov_b64 s[2:3], s[10:11]
; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
; WAVE32-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0
; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4
; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5]
Expand All @@ -904,7 +904,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE64-OPT-NEXT: s_lshr_b32 s6, s0, 6
; WAVE64-OPT-NEXT: s_mov_b64 s[0:1], s[8:9]
; WAVE64-OPT-NEXT: s_mov_b64 s[2:3], s[10:11]
; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
; WAVE64-OPT-NEXT: buffer_store_dword v0, off, s[8:11], 0
; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[8:11], s32 offset:4
; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[4:5]
Expand Down Expand Up @@ -935,10 +935,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5
; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1
; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1
; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill
; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill
; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19
; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42
; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:4
; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0
; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21]
; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23]
Expand Down Expand Up @@ -1020,7 +1020,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18
; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1
; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:132 ; 4-byte Folded Reload
; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload
; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19
; WAVE32-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1
Expand Down Expand Up @@ -1053,10 +1053,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE64-O0-NEXT: s_lshr_b32 s0, s0, 6
; WAVE64-O0-NEXT: v_writelane_b32 v3, s0, 1
; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill
; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill
; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21]
; WAVE64-O0-NEXT: v_mov_b32_e32 v3, 42
; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:4
; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0
; WAVE64-O0-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE64-O0-NEXT: s_mov_b64 s[0:1], s[24:25]
; WAVE64-O0-NEXT: s_mov_b64 s[2:3], s[26:27]
Expand Down Expand Up @@ -1138,7 +1138,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18
; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload
; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload
; WAVE64-O0-NEXT: s_mov_b64 exec, s[20:21]
; WAVE64-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE64-O0-NEXT: v_readlane_b32 s1, v0, 1
Expand Down Expand Up @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5
; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42
; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:4
; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21]
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23]
Expand Down
68 changes: 34 additions & 34 deletions llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 +16,33 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:160 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_b32 v[0:1], v5, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
Expand Down Expand Up @@ -83,16 +83,16 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:448 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:576 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v3, off ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: v_cmpx_eq_u32_e32 0, v2
; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0
; CHECK-NEXT: s_cbranch_execz .LBB1_2
Expand All @@ -101,23 +101,27 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
Expand All @@ -134,18 +138,14 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: ; implicit-def: $vgpr0
Expand All @@ -159,23 +159,27 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill
; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
Expand All @@ -192,25 +196,21 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: .LBB1_4: ; %.exit
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN: NumVgprs: 256
; GCN: ScratchSize: 768
; GCN: ScratchSize: 640

define amdgpu_vs void @main(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, ptr addrspace(4) inreg %arg3, ptr addrspace(4) inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
bb:
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: v_mov_b32_e32 v2, v0
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: global_load_ushort v3, v1, s[4:5] offset:4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4
Expand All @@ -32,28 +32,28 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: v_writelane_b32 v0, s4, 0
; CHECK-NEXT: v_writelane_b32 v0, s5, 1
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %bb193
; CHECK-NEXT: .LBB0_2: ; %bb194
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], v0, s4
; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5]
; CHECK-NEXT: s_cbranch_vccnz .LBB0_4
; CHECK-NEXT: ; %bb.3: ; %bb201
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, V2@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, V2@rel32@hi+12
Expand All @@ -66,7 +66,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-NEXT: ; divergent unreachable
; CHECK-NEXT: .LBB0_4: ; %UnifiedReturnBlock
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
Expand Down
Loading