833 changes: 303 additions & 530 deletions llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

Large diffs are not rendered by default.

12 changes: 5 additions & 7 deletions llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


; GCN-LABEL: {{^}}divergent_if_endif:
; VGPR: workitem_private_segment_byte_size = 16{{$}}
; VGPR: workitem_private_segment_byte_size = 12{{$}}


; GCN: {{^}}; %bb.0:
Expand All @@ -19,7 +19,7 @@

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}}

; Spill saved exec
; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec
Expand Down Expand Up @@ -82,13 +82,13 @@ endif:
}

; GCN-LABEL: {{^}}divergent_loop:
; VGPR: workitem_private_segment_byte_size = 20{{$}}
; VGPR: workitem_private_segment_byte_size = 16{{$}}

; GCN: {{^}}; %bb.0:
; GCN-DAG: s_mov_b32 m0, -1
; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}}
; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]]
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}}

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
Expand Down Expand Up @@ -166,7 +166,7 @@ end:
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0
; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, [[ZERO]]
; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]]

; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec
; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]]
Expand All @@ -175,7 +175,6 @@ end:
; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
Expand All @@ -188,7 +187,6 @@ end:
; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]]

; GCN: [[FLOW]]: ; %Flow
; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET]] ; 4-byte Folded Reload
; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
Expand Down
24 changes: 16 additions & 8 deletions llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,22 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -67,20 +69,22 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -103,20 +107,22 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -139,10 +145,11 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4
Expand All @@ -151,9 +158,10 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 16
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
Expand All @@ -48,6 +48,7 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: v_writelane_b32 v40, s47, 15
; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v42, s16, 0
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
Expand Down Expand Up @@ -91,9 +92,10 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 16
; CHECK-NEXT: v_readlane_b32 s4, v42, 0
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
; CHECK-NEXT: s_mov_b32 s33, s4
Expand Down
279 changes: 0 additions & 279 deletions llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir

This file was deleted.

10 changes: 6 additions & 4 deletions llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v42, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x800
; GCN-NEXT: v_writelane_b32 v43, s16, 0
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
Expand All @@ -38,11 +39,12 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: v_readlane_b32 s31, v42, 1
; GCN-NEXT: v_readlane_b32 s30, v42, 0
; GCN-NEXT: v_readlane_b32 s4, v42, 2
; GCN-NEXT: v_readlane_b32 s4, v43, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_addk_i32 s32, 0xf800
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
Expand Down
74 changes: 50 additions & 24 deletions llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,21 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0
; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, exec_lo
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3
; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 0
; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s5
; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0
; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 4
; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1
; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1
; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4
; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s5 ; 4-byte Folded Spill
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105
; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 0
; FLAT_SCR_OPT-NEXT: scratch_load_dword v0, off, s5
; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s4
; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8
; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1
; FLAT_SCR_OPT-NEXT: ;;#ASMSTART
Expand Down Expand Up @@ -222,31 +228,44 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_OPT-NEXT: ;;#ASMEND
; FLAT_SCR_OPT-NEXT: ;;#ASMSTART
; FLAT_SCR_OPT-NEXT: ;;#ASMEND
; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1
; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4
; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload
; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, exec_lo
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3
; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0
; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v1, s3
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 4
; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105
; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0
; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0
; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1
; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0
; FLAT_SCR_OPT-NEXT: ; kill: killed $vgpr1
; FLAT_SCR_OPT-NEXT: global_store_dword v2, v0, s[0:1]
; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s3
; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s2
; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, 0
; FLAT_SCR_OPT-NEXT: global_store_dword v1, v0, s[0:1]
; FLAT_SCR_OPT-NEXT: s_endpgm
;
; FLAT_SCR_ARCH-LABEL: test:
; FLAT_SCR_ARCH: ; %bb.0:
; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, exec_lo
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 0
; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s5
; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0)
; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 4
; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1
; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4
; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s5 ; 4-byte Folded Spill
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 0
; FLAT_SCR_ARCH-NEXT: scratch_load_dword v0, off, s5
; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s4
; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8
; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1
; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART
Expand Down Expand Up @@ -343,17 +362,24 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_ARCH-NEXT: ;;#ASMEND
; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART
; FLAT_SCR_ARCH-NEXT: ;;#ASMEND
; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4
; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, exec_lo
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0
; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v1, s3
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 4
; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0
; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0
; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1
; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0
; FLAT_SCR_ARCH-NEXT: ; kill: killed $vgpr1
; FLAT_SCR_ARCH-NEXT: global_store_dword v2, v0, s[0:1]
; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s3
; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s2
; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, 0
; FLAT_SCR_ARCH-NEXT: global_store_dword v1, v0, s[0:1]
; FLAT_SCR_ARCH-NEXT: s_endpgm
call void asm sideeffect "", "~{s[0:7]}" ()
call void asm sideeffect "", "~{s[8:15]}" ()
Expand Down
46 changes: 26 additions & 20 deletions llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo
; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def $exec_lo
Expand All @@ -37,13 +38,14 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi
; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def $exec_hi
Expand All @@ -62,16 +64,17 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $exec
; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def $exec
Expand All @@ -93,12 +96,13 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo
Expand All @@ -116,12 +120,13 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi
Expand All @@ -139,15 +144,16 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec
Expand Down
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def $m0
; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
Expand All @@ -43,12 +44,13 @@ body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32
; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[8:9], -1
; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9]
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s4, 2
; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0
; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s4, 0
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1
; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -28,9 +29,10 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v41, 0
; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1
; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; SPILL-TO-VGPR-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7]
; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00
; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-LABEL: gfx_func:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: s_mov_b32 s36, s33
; SDAG-NEXT: s_mov_b32 s38, s33
; SDAG-NEXT: s_mov_b32 s33, s32
; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1
; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -81,14 +81,14 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; SDAG-NEXT: s_mov_b64 exec, s[34:35]
; SDAG-NEXT: s_addk_i32 s32, 0xfc00
; SDAG-NEXT: s_mov_b32 s33, s36
; SDAG-NEXT: s_mov_b32 s33, s38
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: gfx_func:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s36, s33
; GISEL-NEXT: s_mov_b32 s38, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -161,7 +161,7 @@ define amdgpu_gfx void @gfx_func() {
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[34:35]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s36
; GISEL-NEXT: s_mov_b32 s33, s38
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
call void @extern_c_func()
Expand Down
4,149 changes: 2,608 additions & 1,541 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll

Large diffs are not rendered by default.

396 changes: 244 additions & 152 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll

Large diffs are not rendered by default.

112 changes: 56 additions & 56 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll

Large diffs are not rendered by default.

72 changes: 44 additions & 28 deletions llvm/test/CodeGen/AMDGPU/indirect-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -397,8 +397,9 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -465,9 +466,10 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: v_readlane_b32 s34, v40, 2
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -481,8 +483,9 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: v_writelane_b32 v41, s16, 0
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -549,9 +552,10 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: v_readlane_b32 s34, v40, 2
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: v_readlane_b32 s4, v41, 0
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s4
Expand All @@ -569,8 +573,9 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -640,9 +645,10 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GCN-NEXT: v_readlane_b32 s34, v40, 2
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -656,8 +662,9 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: v_writelane_b32 v41, s16, 0
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -725,9 +732,10 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GISEL-NEXT: v_readlane_b32 s34, v40, 2
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: v_readlane_b32 s4, v41, 0
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s4
Expand All @@ -745,8 +753,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -815,9 +824,10 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GCN-NEXT: v_readlane_b32 s34, v40, 2
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -831,8 +841,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: v_writelane_b32 v41, s16, 0
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -901,9 +912,10 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: v_readlane_b32 s34, v40, 2
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: v_readlane_b32 s4, v41, 0
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s4
Expand All @@ -922,8 +934,9 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 20
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -1001,9 +1014,10 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-NEXT: v_readlane_b32 s34, v40, 2
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 20
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -1017,8 +1031,9 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v40, s16, 20
; GISEL-NEXT: v_writelane_b32 v41, s16, 0
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -1096,9 +1111,10 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-NEXT: v_readlane_b32 s34, v40, 2
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: v_readlane_b32 s4, v40, 20
; GISEL-NEXT: v_readlane_b32 s4, v41, 0
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s4
Expand Down Expand Up @@ -1311,7 +1327,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s12, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1402,14 +1418,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_mov_b32 s33, s12
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s12, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1500,7 +1516,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_mov_b32 s33, s12
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
call amdgpu_gfx void %fptr(i32 %i)
Expand All @@ -1515,7 +1531,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s12, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1604,14 +1620,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_mov_b32 s33, s12
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s12, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1700,7 +1716,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_mov_b32 s33, s12
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call amdgpu_gfx i32 %fptr(i32 %i)
Expand All @@ -1712,7 +1728,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GCN-LABEL: test_indirect_tail_call_vgpr_ptr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s12, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1798,14 +1814,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_mov_b32 s33, s12
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s12, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1891,7 +1907,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_mov_b32 s33, s12
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
tail call amdgpu_gfx void %fptr()
Expand Down
73 changes: 36 additions & 37 deletions llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ define void @f0() {
; GFX11-LABEL: f0:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s3, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill
Expand All @@ -28,7 +28,7 @@ define void @f0() {
; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_mov_b32 s33, s3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
Expand All @@ -53,28 +53,28 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-LABEL: f2:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5]
; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x24
; GFX11-NEXT: v_mov_b32_e32 v31, v0
; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24
; GFX11-NEXT: s_mov_b32 s18, s14
; GFX11-NEXT: s_mov_b32 s12, s13
; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31
; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_mov_b32 s20, 0
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31
; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_mov_b32 s19, exec_lo
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0
; GFX11-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB2_13
; GFX11-NEXT: ; %bb.1: ; %bb14
; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitcmp1_b32 s21, 0
; GFX11-NEXT: s_cselect_b32 s25, -1, 0
; GFX11-NEXT: s_cselect_b32 s24, -1, 0
; GFX11-NEXT: s_bitcmp0_b32 s21, 0
; GFX11-NEXT: s_mov_b32 s21, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB2_3
Expand All @@ -90,41 +90,40 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_mov_b32 s1, -1
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
; GFX11-NEXT: s_cbranch_vccz .LBB2_4
; GFX11-NEXT: s_cbranch_execz .LBB2_4
; GFX11-NEXT: s_branch .LBB2_12
; GFX11-NEXT: .LBB2_3:
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB2_4: ; %bb16
; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x54
; GFX11-NEXT: s_load_b32 s3, s[16:17], 0x54
; GFX11-NEXT: s_bitcmp1_b32 s23, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_and_b32 s3, s23, 1
; GFX11-NEXT: s_and_b32 s9, s23, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitcmp1_b32 s2, 0
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_bitcmp1_b32 s3, 0
; GFX11-NEXT: s_mov_b32 s3, -1
; GFX11-NEXT: s_cselect_b32 s8, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s3, 0
; GFX11-NEXT: s_cmp_eq_u32 s9, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB2_8
; GFX11-NEXT: ; %bb.5: ; %bb18.preheader
; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mul_hi_u32 s2, s29, s28
; GFX11-NEXT: s_mul_i32 s3, s29, s28
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, 1
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25
; GFX11-NEXT: s_mul_hi_u32 s3, s29, s28
; GFX11-NEXT: s_mul_i32 s9, s29, s28
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_alignbit_b32 v0, s3, s9, 1
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 1
; GFX11-NEXT: s_lshr_b32 s2, s2, s30
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s2, s2, s22
; GFX11-NEXT: s_mul_i32 s2, s2, s20
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s24, s2
; GFX11-NEXT: s_or_b32 s3, s3, 1
; GFX11-NEXT: s_lshr_b32 s3, s3, s30
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s9, s3, s22
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_mul_i32 s9, s9, s20
; GFX11-NEXT: s_or_b32 s2, s2, s9
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], 1
; GFX11-NEXT: global_load_u16 v2, v1, s[22:23]
; GFX11-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -152,10 +151,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_or_b32 s3, s2, s3
; GFX11-NEXT: s_cbranch_vccz .LBB2_6
; GFX11-NEXT: ; %bb.7: ; %Flow
; GFX11-NEXT: s_mov_b32 s2, 0
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: .LBB2_8: ; %Flow12
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s3
; GFX11-NEXT: s_cbranch_vccz .LBB2_12
; GFX11-NEXT: ; %bb.9:
; GFX11-NEXT: s_xor_b32 s0, s8, -1
Expand All @@ -167,11 +166,11 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: ; %bb.11: ; %Flow6
; GFX11-NEXT: s_mov_b32 s21, -1
; GFX11-NEXT: .LBB2_12: ; %Flow11
; GFX11-NEXT: s_and_b32 s3, s1, exec_lo
; GFX11-NEXT: s_and_b32 s20, s1, exec_lo
; GFX11-NEXT: s_or_not1_b32 s0, s21, exec_lo
; GFX11-NEXT: .LBB2_13: ; %Flow9
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19
; GFX11-NEXT: s_and_saveexec_b32 s19, s0
; GFX11-NEXT: s_and_saveexec_b32 s2, s0
; GFX11-NEXT: s_cbranch_execz .LBB2_15
; GFX11-NEXT: ; %bb.14: ; %bb43
; GFX11-NEXT: s_add_u32 s8, s16, 0x58
Expand All @@ -184,10 +183,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_mov_b32 s14, s15
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_or_b32 s3, s3, exec_lo
; GFX11-NEXT: s_or_b32 s20, s20, exec_lo
; GFX11-NEXT: .LBB2_15: ; %Flow14
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19
; GFX11-NEXT: s_and_saveexec_b32 s0, s3
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX11-NEXT: s_and_saveexec_b32 s0, s20
; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock
; GFX11-NEXT: ; divergent unreachable
; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock
Expand Down
44 changes: 8 additions & 36 deletions llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,17 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s12, s33, 0x100200
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: v_writelane_b32 v40, s16, 0
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
; CHECK-NEXT: v_readlane_b32 s14, v40, 0
; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v3, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s8, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v0, s8, 1
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s8, s33, 0x100200
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v40, s8, 1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def vgpr10
; CHECK-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -69,39 +56,24 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s4, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_add_i32 s4, s33, 0x100100
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: v_readlane_b32 s4, v0, 1
; CHECK-NEXT: v_readlane_b32 s4, v40, 1
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_cmp_eq_u32 s4, s5
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %store
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s4, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_add_i32 s4, s33, 0x100000
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b32 v1, v2
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: ds_write_b32 v0, v1
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: .LBB0_2: ; %end
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s4, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
%arr = alloca < 1339 x i32>, align 8192, addrspace(5)
%cmp = icmp ne i32 %val, 0
Expand Down
5 changes: 0 additions & 5 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI lower SGPR spill instructions
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI Lower WWM Copies
; GCN-O0-NEXT: SI Fix VGPR copies
; GCN-O0-NEXT: Remove Redundant DEBUG_VALUE analysis
; GCN-O0-NEXT: Fixup Statepoint Caller Saved
Expand Down Expand Up @@ -361,7 +360,6 @@
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: SI Lower WWM Copies
; GCN-O1-NEXT: GCN NSA Reassign
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: Stack Slot Coloring
Expand Down Expand Up @@ -655,7 +653,6 @@
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
Expand Down Expand Up @@ -959,7 +956,6 @@
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: SI Lower WWM Copies
; GCN-O2-NEXT: GCN NSA Reassign
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: Stack Slot Coloring
Expand Down Expand Up @@ -1275,7 +1271,6 @@
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: SI Lower WWM Copies
; GCN-O3-NEXT: GCN NSA Reassign
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: Stack Slot Coloring
Expand Down
590 changes: 241 additions & 349 deletions llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll

Large diffs are not rendered by default.

677 changes: 281 additions & 396 deletions llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,13 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: v_writelane_b32 v40, s4, 5
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v44, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s36, 3
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
Expand Down Expand Up @@ -229,9 +230,10 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 5
; GFX9-NEXT: v_readlane_b32 s4, v44, 0
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_addk_i32 s32, 0xf800
; GFX9-NEXT: s_mov_b32 s33, s4
Expand Down
47 changes: 11 additions & 36 deletions llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s18, s33
; CHECK-NEXT: s_mov_b32 s24, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
Expand All @@ -54,7 +54,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
; CHECK-NEXT: s_mov_b32 s33, s18
; CHECK-NEXT: s_mov_b32 s33, s24
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
Expand All @@ -66,16 +66,12 @@ bb:
define amdgpu_kernel void @kernel_call() {
; CHECK-LABEL: kernel_call:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s32, 0x400
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
Expand All @@ -93,10 +89,6 @@ define amdgpu_kernel void @kernel_call() {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_callee()
Expand All @@ -121,9 +113,9 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12
; CHECK-NEXT: v_readlane_b32 s33, v1, 0
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1
; CHECK-NEXT: s_xor_saveexec_b64 s[20:21], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: s_mov_b64 exec, s[20:21]
; CHECK-NEXT: s_setpc_b64 s[16:17]
bb:
call void asm sideeffect "; clobber csr v40", "~{v40}"()
Expand All @@ -134,16 +126,12 @@ bb:
define amdgpu_kernel void @kernel_tailcall() {
; CHECK-LABEL: kernel_tailcall:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s32, 0x400
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
Expand All @@ -161,10 +149,6 @@ define amdgpu_kernel void @kernel_tailcall() {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
Expand All @@ -188,7 +172,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s18, s33
; CHECK-NEXT: s_mov_b32 s24, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
Expand All @@ -210,7 +194,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
; CHECK-NEXT: s_mov_b32 s33, s18
; CHECK-NEXT: s_mov_b32 s33, s24
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
Expand All @@ -222,7 +206,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s19, s33
; CHECK-NEXT: s_mov_b32 s25, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
Expand All @@ -244,7 +228,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
; CHECK-NEXT: s_mov_b32 s33, s19
; CHECK-NEXT: s_mov_b32 s33, s25
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
Expand All @@ -255,16 +239,12 @@ entry:
define protected amdgpu_kernel void @kernel() {
; CHECK-LABEL: kernel:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b32 s32, 0x400
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
Expand All @@ -282,11 +262,6 @@ define protected amdgpu_kernel void @kernel() {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr0 killed $exec
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
entry:
%call = call i32 @caller_save_vgpr_spill_fp()
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/nested-calls.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ declare void @external_void_func_i32(i32) #0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-DAG: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2
; GCN-DAG: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0
; GCN-DAG: v_writelane_b32 v40, s30, 0
; GCN-DAG: v_writelane_b32 v40, s31, 1

Expand All @@ -25,9 +26,10 @@ declare void @external_void_func_i32(i32) #0
; GCN: v_readlane_b32 s31, v40, 1
; GCN: v_readlane_b32 s30, v40, 0

; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2
; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
Expand Down
8 changes: 5 additions & 3 deletions llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 2
; CHECK-NEXT: v_writelane_b32 v41, s16, 0
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
Expand All @@ -38,11 +39,12 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: v_readlane_b32 s4, v41, 0
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1
; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
Expand Down
1,233 changes: 566 additions & 667 deletions llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll

Large diffs are not rendered by default.

816 changes: 0 additions & 816 deletions llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll

This file was deleted.

391 changes: 16 additions & 375 deletions llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll

Large diffs are not rendered by default.

5 changes: 0 additions & 5 deletions llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,13 @@
; DEFAULT-NEXT: Virtual Register Map
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: SI Lower WWM Copies
; DEFAULT-NEXT: GCN NSA Reassign
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: Stack Slot Coloring

; O0: Fast Register Allocator
; O0-NEXT: SI lower SGPR spill instructions
; O0-NEXT: Fast Register Allocator
; O0-NEXT: SI Lower WWM Copies
; O0-NEXT: SI Fix VGPR copies


Expand All @@ -51,7 +49,6 @@
; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter
; BASIC-DEFAULT-NEXT: Greedy Register Allocator
; BASIC-DEFAULT-NEXT: SI Lower WWM Copies
; BASIC-DEFAULT-NEXT: GCN NSA Reassign
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: Stack Slot Coloring
Expand All @@ -64,7 +61,6 @@
; DEFAULT-BASIC-NEXT: Virtual Register Map
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: Stack Slot Coloring
Expand All @@ -83,7 +79,6 @@
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: SI Lower WWM Copies
; BASIC-BASIC-NEXT: GCN NSA Reassign
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: Stack Slot Coloring
Expand Down
37 changes: 8 additions & 29 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s

# After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, replace the dead frame index in the DBG_VALUE instruction with reg 0.
# Otherwise, the test would crash during PEI while trying to replace the dead frame index.
Expand Down Expand Up @@ -40,33 +39,13 @@ machineFunctionInfo:
workGroupIDX: { reg: '$sgpr8' }
privateSegmentWaveByteOffset: { reg: '$sgpr9' }
body: |
; SGPR_SPILL-LABEL: name: test
; SGPR_SPILL: bb.0:
; SGPR_SPILL-NEXT: successors: %bb.1(0x80000000)
; SGPR_SPILL-NEXT: {{ $}}
; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILL-NEXT: renamable $sgpr10 = IMPLICIT_DEF
; SGPR_SPILL-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]]
; SGPR_SPILL-NEXT: DBG_VALUE $noreg, 0
; SGPR_SPILL-NEXT: {{ $}}
; SGPR_SPILL-NEXT: bb.1:
; SGPR_SPILL-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0
; SGPR_SPILL-NEXT: KILL [[V_WRITELANE_B32_]]
; SGPR_SPILL-NEXT: S_ENDPGM 0
; PEI-LABEL: name: test
; PEI: bb.0:
; PEI-NEXT: successors: %bb.1(0x80000000)
; PEI-NEXT: {{ $}}
; PEI-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; PEI-NEXT: renamable $sgpr10 = IMPLICIT_DEF
; PEI-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, killed $vgpr0
; PEI-NEXT: {{ $}}
; PEI-NEXT: bb.1:
; PEI-NEXT: liveins: $vgpr0
; PEI-NEXT: {{ $}}
; PEI-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 0
; PEI-NEXT: KILL killed renamable $vgpr0
; PEI-NEXT: S_ENDPGM 0
; CHECK-LABEL: name: test
; CHECK: bb.0:
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, $vgpr0
; CHECK: DBG_VALUE $noreg, 0
; CHECK: bb.1:
; CHECK: $sgpr10 = V_READLANE_B32 $vgpr0, 0
; CHECK: S_ENDPGM 0
bb.0:
renamable $sgpr10 = IMPLICIT_DEF
SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s

# After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, we replace the dead frame index in the DBG_VALUE instruction with reg 0.
# Skip looking for frame indices in the debug value instruction for incoming arguments passed via stack. The test would crash otherwise.
Expand Down Expand Up @@ -45,7 +45,7 @@ machineFunctionInfo:
body: |
; CHECK-LABEL: name: test
; CHECK: bb.0:
; CHECK: DBG_VALUE
; CHECK: DBG_VALUE $noreg, 0
bb.0:
renamable $sgpr10 = IMPLICIT_DEF
SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
Expand Down
300 changes: 142 additions & 158 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,9 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 s0, s0, s15
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
Expand All @@ -31,194 +23,186 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_writelane_b32 v1, s8, 0
; GCN-NEXT: v_writelane_b32 v1, s9, 1
; GCN-NEXT: v_writelane_b32 v1, s10, 2
; GCN-NEXT: v_writelane_b32 v1, s11, 3
; GCN-NEXT: v_writelane_b32 v1, s12, 4
; GCN-NEXT: v_writelane_b32 v1, s13, 5
; GCN-NEXT: v_writelane_b32 v1, s14, 6
; GCN-NEXT: v_writelane_b32 v1, s15, 7
; GCN-NEXT: v_writelane_b32 v1, s16, 8
; GCN-NEXT: v_writelane_b32 v1, s17, 9
; GCN-NEXT: v_writelane_b32 v1, s18, 10
; GCN-NEXT: v_writelane_b32 v1, s19, 11
; GCN-NEXT: v_writelane_b32 v1, s20, 12
; GCN-NEXT: v_writelane_b32 v1, s21, 13
; GCN-NEXT: v_writelane_b32 v1, s22, 14
; GCN-NEXT: v_writelane_b32 v1, s23, 15
; GCN-NEXT: v_writelane_b32 v23, s8, 0
; GCN-NEXT: v_writelane_b32 v23, s9, 1
; GCN-NEXT: v_writelane_b32 v23, s10, 2
; GCN-NEXT: v_writelane_b32 v23, s11, 3
; GCN-NEXT: v_writelane_b32 v23, s12, 4
; GCN-NEXT: v_writelane_b32 v23, s13, 5
; GCN-NEXT: v_writelane_b32 v23, s14, 6
; GCN-NEXT: v_writelane_b32 v23, s15, 7
; GCN-NEXT: v_writelane_b32 v23, s16, 8
; GCN-NEXT: v_writelane_b32 v23, s17, 9
; GCN-NEXT: v_writelane_b32 v23, s18, 10
; GCN-NEXT: v_writelane_b32 v23, s19, 11
; GCN-NEXT: v_writelane_b32 v23, s20, 12
; GCN-NEXT: v_writelane_b32 v23, s21, 13
; GCN-NEXT: v_writelane_b32 v23, s22, 14
; GCN-NEXT: v_writelane_b32 v23, s23, 15
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v1, s8, 16
; GCN-NEXT: v_writelane_b32 v1, s9, 17
; GCN-NEXT: v_writelane_b32 v1, s10, 18
; GCN-NEXT: v_writelane_b32 v1, s11, 19
; GCN-NEXT: v_writelane_b32 v1, s12, 20
; GCN-NEXT: v_writelane_b32 v1, s13, 21
; GCN-NEXT: v_writelane_b32 v1, s14, 22
; GCN-NEXT: v_writelane_b32 v1, s15, 23
; GCN-NEXT: v_writelane_b32 v1, s16, 24
; GCN-NEXT: v_writelane_b32 v1, s17, 25
; GCN-NEXT: v_writelane_b32 v1, s18, 26
; GCN-NEXT: v_writelane_b32 v1, s19, 27
; GCN-NEXT: v_writelane_b32 v1, s20, 28
; GCN-NEXT: v_writelane_b32 v1, s21, 29
; GCN-NEXT: v_writelane_b32 v1, s22, 30
; GCN-NEXT: v_writelane_b32 v1, s23, 31
; GCN-NEXT: v_writelane_b32 v23, s8, 16
; GCN-NEXT: v_writelane_b32 v23, s9, 17
; GCN-NEXT: v_writelane_b32 v23, s10, 18
; GCN-NEXT: v_writelane_b32 v23, s11, 19
; GCN-NEXT: v_writelane_b32 v23, s12, 20
; GCN-NEXT: v_writelane_b32 v23, s13, 21
; GCN-NEXT: v_writelane_b32 v23, s14, 22
; GCN-NEXT: v_writelane_b32 v23, s15, 23
; GCN-NEXT: v_writelane_b32 v23, s16, 24
; GCN-NEXT: v_writelane_b32 v23, s17, 25
; GCN-NEXT: v_writelane_b32 v23, s18, 26
; GCN-NEXT: v_writelane_b32 v23, s19, 27
; GCN-NEXT: v_writelane_b32 v23, s20, 28
; GCN-NEXT: v_writelane_b32 v23, s21, 29
; GCN-NEXT: v_writelane_b32 v23, s22, 30
; GCN-NEXT: v_writelane_b32 v23, s23, 31
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v1, s8, 32
; GCN-NEXT: v_writelane_b32 v1, s9, 33
; GCN-NEXT: v_writelane_b32 v1, s10, 34
; GCN-NEXT: v_writelane_b32 v1, s11, 35
; GCN-NEXT: v_writelane_b32 v1, s12, 36
; GCN-NEXT: v_writelane_b32 v1, s13, 37
; GCN-NEXT: v_writelane_b32 v1, s14, 38
; GCN-NEXT: v_writelane_b32 v1, s15, 39
; GCN-NEXT: v_writelane_b32 v1, s16, 40
; GCN-NEXT: v_writelane_b32 v1, s17, 41
; GCN-NEXT: v_writelane_b32 v1, s18, 42
; GCN-NEXT: v_writelane_b32 v1, s19, 43
; GCN-NEXT: v_writelane_b32 v1, s20, 44
; GCN-NEXT: v_writelane_b32 v1, s21, 45
; GCN-NEXT: v_writelane_b32 v1, s22, 46
; GCN-NEXT: v_writelane_b32 v1, s23, 47
; GCN-NEXT: v_writelane_b32 v23, s8, 32
; GCN-NEXT: v_writelane_b32 v23, s9, 33
; GCN-NEXT: v_writelane_b32 v23, s10, 34
; GCN-NEXT: v_writelane_b32 v23, s11, 35
; GCN-NEXT: v_writelane_b32 v23, s12, 36
; GCN-NEXT: v_writelane_b32 v23, s13, 37
; GCN-NEXT: v_writelane_b32 v23, s14, 38
; GCN-NEXT: v_writelane_b32 v23, s15, 39
; GCN-NEXT: v_writelane_b32 v23, s16, 40
; GCN-NEXT: v_writelane_b32 v23, s17, 41
; GCN-NEXT: v_writelane_b32 v23, s18, 42
; GCN-NEXT: v_writelane_b32 v23, s19, 43
; GCN-NEXT: v_writelane_b32 v23, s20, 44
; GCN-NEXT: v_writelane_b32 v23, s21, 45
; GCN-NEXT: v_writelane_b32 v23, s22, 46
; GCN-NEXT: v_writelane_b32 v23, s23, 47
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v1, s8, 48
; GCN-NEXT: v_writelane_b32 v1, s9, 49
; GCN-NEXT: v_writelane_b32 v1, s10, 50
; GCN-NEXT: v_writelane_b32 v1, s11, 51
; GCN-NEXT: v_writelane_b32 v1, s12, 52
; GCN-NEXT: v_writelane_b32 v1, s13, 53
; GCN-NEXT: v_writelane_b32 v1, s14, 54
; GCN-NEXT: v_writelane_b32 v1, s15, 55
; GCN-NEXT: v_writelane_b32 v1, s16, 56
; GCN-NEXT: v_writelane_b32 v1, s17, 57
; GCN-NEXT: v_writelane_b32 v1, s18, 58
; GCN-NEXT: v_writelane_b32 v1, s19, 59
; GCN-NEXT: v_writelane_b32 v1, s20, 60
; GCN-NEXT: v_writelane_b32 v1, s21, 61
; GCN-NEXT: v_writelane_b32 v1, s22, 62
; GCN-NEXT: v_writelane_b32 v1, s23, 63
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: v_writelane_b32 v23, s8, 48
; GCN-NEXT: v_writelane_b32 v23, s9, 49
; GCN-NEXT: v_writelane_b32 v23, s10, 50
; GCN-NEXT: v_writelane_b32 v23, s11, 51
; GCN-NEXT: v_writelane_b32 v23, s12, 52
; GCN-NEXT: v_writelane_b32 v23, s13, 53
; GCN-NEXT: v_writelane_b32 v23, s14, 54
; GCN-NEXT: v_writelane_b32 v23, s15, 55
; GCN-NEXT: v_writelane_b32 v23, s16, 56
; GCN-NEXT: v_writelane_b32 v23, s17, 57
; GCN-NEXT: v_writelane_b32 v23, s18, 58
; GCN-NEXT: v_writelane_b32 v23, s19, 59
; GCN-NEXT: v_writelane_b32 v23, s20, 60
; GCN-NEXT: v_writelane_b32 v23, s21, 61
; GCN-NEXT: v_writelane_b32 v23, s22, 62
; GCN-NEXT: v_writelane_b32 v23, s23, 63
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[6:7]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: s_mov_b64 s[8:9], exec
; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: v_writelane_b32 v0, s6, 0
; GCN-NEXT: v_writelane_b32 v0, s7, 1
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[8:9]
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s4, s5
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v1, 0
; GCN-NEXT: v_readlane_b32 s5, v1, 1
; GCN-NEXT: v_readlane_b32 s6, v1, 2
; GCN-NEXT: v_readlane_b32 s7, v1, 3
; GCN-NEXT: v_readlane_b32 s8, v1, 4
; GCN-NEXT: v_readlane_b32 s9, v1, 5
; GCN-NEXT: v_readlane_b32 s10, v1, 6
; GCN-NEXT: v_readlane_b32 s11, v1, 7
; GCN-NEXT: v_readlane_b32 s12, v1, 8
; GCN-NEXT: v_readlane_b32 s13, v1, 9
; GCN-NEXT: v_readlane_b32 s14, v1, 10
; GCN-NEXT: v_readlane_b32 s15, v1, 11
; GCN-NEXT: v_readlane_b32 s16, v1, 12
; GCN-NEXT: v_readlane_b32 s17, v1, 13
; GCN-NEXT: v_readlane_b32 s18, v1, 14
; GCN-NEXT: v_readlane_b32 s19, v1, 15
; GCN-NEXT: v_readlane_b32 s4, v23, 0
; GCN-NEXT: v_readlane_b32 s5, v23, 1
; GCN-NEXT: v_readlane_b32 s6, v23, 2
; GCN-NEXT: v_readlane_b32 s7, v23, 3
; GCN-NEXT: v_readlane_b32 s8, v23, 4
; GCN-NEXT: v_readlane_b32 s9, v23, 5
; GCN-NEXT: v_readlane_b32 s10, v23, 6
; GCN-NEXT: v_readlane_b32 s11, v23, 7
; GCN-NEXT: v_readlane_b32 s12, v23, 8
; GCN-NEXT: v_readlane_b32 s13, v23, 9
; GCN-NEXT: v_readlane_b32 s14, v23, 10
; GCN-NEXT: v_readlane_b32 s15, v23, 11
; GCN-NEXT: v_readlane_b32 s16, v23, 12
; GCN-NEXT: v_readlane_b32 s17, v23, 13
; GCN-NEXT: v_readlane_b32 s18, v23, 14
; GCN-NEXT: v_readlane_b32 s19, v23, 15
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s4, v1, 16
; GCN-NEXT: v_readlane_b32 s5, v1, 17
; GCN-NEXT: v_readlane_b32 s6, v1, 18
; GCN-NEXT: v_readlane_b32 s7, v1, 19
; GCN-NEXT: v_readlane_b32 s8, v1, 20
; GCN-NEXT: v_readlane_b32 s9, v1, 21
; GCN-NEXT: v_readlane_b32 s10, v1, 22
; GCN-NEXT: v_readlane_b32 s11, v1, 23
; GCN-NEXT: v_readlane_b32 s12, v1, 24
; GCN-NEXT: v_readlane_b32 s13, v1, 25
; GCN-NEXT: v_readlane_b32 s14, v1, 26
; GCN-NEXT: v_readlane_b32 s15, v1, 27
; GCN-NEXT: v_readlane_b32 s16, v1, 28
; GCN-NEXT: v_readlane_b32 s17, v1, 29
; GCN-NEXT: v_readlane_b32 s18, v1, 30
; GCN-NEXT: v_readlane_b32 s19, v1, 31
; GCN-NEXT: v_readlane_b32 s4, v23, 16
; GCN-NEXT: v_readlane_b32 s5, v23, 17
; GCN-NEXT: v_readlane_b32 s6, v23, 18
; GCN-NEXT: v_readlane_b32 s7, v23, 19
; GCN-NEXT: v_readlane_b32 s8, v23, 20
; GCN-NEXT: v_readlane_b32 s9, v23, 21
; GCN-NEXT: v_readlane_b32 s10, v23, 22
; GCN-NEXT: v_readlane_b32 s11, v23, 23
; GCN-NEXT: v_readlane_b32 s12, v23, 24
; GCN-NEXT: v_readlane_b32 s13, v23, 25
; GCN-NEXT: v_readlane_b32 s14, v23, 26
; GCN-NEXT: v_readlane_b32 s15, v23, 27
; GCN-NEXT: v_readlane_b32 s16, v23, 28
; GCN-NEXT: v_readlane_b32 s17, v23, 29
; GCN-NEXT: v_readlane_b32 s18, v23, 30
; GCN-NEXT: v_readlane_b32 s19, v23, 31
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s4, v1, 32
; GCN-NEXT: v_readlane_b32 s5, v1, 33
; GCN-NEXT: v_readlane_b32 s6, v1, 34
; GCN-NEXT: v_readlane_b32 s7, v1, 35
; GCN-NEXT: v_readlane_b32 s8, v1, 36
; GCN-NEXT: v_readlane_b32 s9, v1, 37
; GCN-NEXT: v_readlane_b32 s10, v1, 38
; GCN-NEXT: v_readlane_b32 s11, v1, 39
; GCN-NEXT: v_readlane_b32 s12, v1, 40
; GCN-NEXT: v_readlane_b32 s13, v1, 41
; GCN-NEXT: v_readlane_b32 s14, v1, 42
; GCN-NEXT: v_readlane_b32 s15, v1, 43
; GCN-NEXT: v_readlane_b32 s16, v1, 44
; GCN-NEXT: v_readlane_b32 s17, v1, 45
; GCN-NEXT: v_readlane_b32 s18, v1, 46
; GCN-NEXT: v_readlane_b32 s19, v1, 47
; GCN-NEXT: v_readlane_b32 s4, v23, 32
; GCN-NEXT: v_readlane_b32 s5, v23, 33
; GCN-NEXT: v_readlane_b32 s6, v23, 34
; GCN-NEXT: v_readlane_b32 s7, v23, 35
; GCN-NEXT: v_readlane_b32 s8, v23, 36
; GCN-NEXT: v_readlane_b32 s9, v23, 37
; GCN-NEXT: v_readlane_b32 s10, v23, 38
; GCN-NEXT: v_readlane_b32 s11, v23, 39
; GCN-NEXT: v_readlane_b32 s12, v23, 40
; GCN-NEXT: v_readlane_b32 s13, v23, 41
; GCN-NEXT: v_readlane_b32 s14, v23, 42
; GCN-NEXT: v_readlane_b32 s15, v23, 43
; GCN-NEXT: v_readlane_b32 s16, v23, 44
; GCN-NEXT: v_readlane_b32 s17, v23, 45
; GCN-NEXT: v_readlane_b32 s18, v23, 46
; GCN-NEXT: v_readlane_b32 s19, v23, 47
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s8, v1, 48
; GCN-NEXT: v_readlane_b32 s9, v1, 49
; GCN-NEXT: v_readlane_b32 s10, v1, 50
; GCN-NEXT: v_readlane_b32 s11, v1, 51
; GCN-NEXT: v_readlane_b32 s12, v1, 52
; GCN-NEXT: v_readlane_b32 s13, v1, 53
; GCN-NEXT: v_readlane_b32 s14, v1, 54
; GCN-NEXT: v_readlane_b32 s15, v1, 55
; GCN-NEXT: v_readlane_b32 s16, v1, 56
; GCN-NEXT: v_readlane_b32 s17, v1, 57
; GCN-NEXT: v_readlane_b32 s18, v1, 58
; GCN-NEXT: v_readlane_b32 s19, v1, 59
; GCN-NEXT: v_readlane_b32 s20, v1, 60
; GCN-NEXT: v_readlane_b32 s21, v1, 61
; GCN-NEXT: v_readlane_b32 s22, v1, 62
; GCN-NEXT: v_readlane_b32 s23, v1, 63
; GCN-NEXT: v_readlane_b32 s8, v23, 48
; GCN-NEXT: v_readlane_b32 s9, v23, 49
; GCN-NEXT: v_readlane_b32 s10, v23, 50
; GCN-NEXT: v_readlane_b32 s11, v23, 51
; GCN-NEXT: v_readlane_b32 s12, v23, 52
; GCN-NEXT: v_readlane_b32 s13, v23, 53
; GCN-NEXT: v_readlane_b32 s14, v23, 54
; GCN-NEXT: v_readlane_b32 s15, v23, 55
; GCN-NEXT: v_readlane_b32 s16, v23, 56
; GCN-NEXT: v_readlane_b32 s17, v23, 57
; GCN-NEXT: v_readlane_b32 s18, v23, 58
; GCN-NEXT: v_readlane_b32 s19, v23, 59
; GCN-NEXT: v_readlane_b32 s20, v23, 60
; GCN-NEXT: v_readlane_b32 s21, v23, 61
; GCN-NEXT: v_readlane_b32 s22, v23, 62
; GCN-NEXT: v_readlane_b32 s23, v23, 63
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v0, 0
; GCN-NEXT: v_readlane_b32 s5, v0, 1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:5]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB0_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
; GCN-NEXT: s_endpgm
call void asm sideeffect "", "~{v[0:7]}" () #0
call void asm sideeffect "", "~{v[8:15]}" () #0
Expand Down
14 changes: 6 additions & 8 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,10 @@ body: |
liveins: $sgpr4
; CHECK-LABEL: name: sgpr_spill_s64_undef_high32
; CHECK: liveins: $sgpr4
; CHECK: liveins: $sgpr4, $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5
SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
...
Expand All @@ -46,11 +45,10 @@ body: |
liveins: $sgpr5
; CHECK-LABEL: name: sgpr_spill_s64_undef_low32
; CHECK: liveins: $sgpr5
; CHECK: liveins: $sgpr5, $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5
SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
...
Loading