750 changes: 366 additions & 384 deletions llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@

; ALL-LABEL: {{^}}kernel:
; GFX908: .amdhsa_next_free_vgpr 32
; GFX908-NEXT: .amdhsa_next_free_sgpr 33
; GFX908-NEXT: .amdhsa_next_free_sgpr 36

; GFX90A: .amdhsa_next_free_vgpr 59
; GFX90A-NEXT: .amdhsa_next_free_sgpr 33
; GFX90A: .amdhsa_next_free_vgpr 65
; GFX90A-NEXT: .amdhsa_next_free_sgpr 36
; GFX90A-NEXT: .amdhsa_accum_offset 32
define amdgpu_kernel void @kernel() #0 {
bb:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

; CHECK-LABEL: {{^}}kernel0:
; CHECK: .amdhsa_next_free_vgpr 53
; CHECK-NEXT: .amdhsa_next_free_sgpr 33
; CHECK-NEXT: .amdhsa_next_free_sgpr 36
define amdgpu_kernel void @kernel0() #0 {
bb:
call void @alias0() #2
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

; CHECK-LABEL: {{^}}kernel1:
; CHECK: .amdhsa_next_free_vgpr 41
; CHECK-NEXT: .amdhsa_next_free_sgpr 33
; CHECK-NEXT: .amdhsa_next_free_sgpr 36
define amdgpu_kernel void @kernel1() #0 {
bb:
call void asm sideeffect "; clobber v40 ", "~{v40}"()
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

; CHECK-LABEL: {{^}}kernel2:
; CHECK: .amdhsa_next_free_vgpr 53
; CHECK-NEXT: .amdhsa_next_free_sgpr 33
; CHECK-NEXT: .amdhsa_next_free_sgpr 36
define amdgpu_kernel void @kernel2() #0 {
bb:
call void @alias2() #2
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

; CHECK-LABEL: {{^}}kernel3:
; CHECK: .amdhsa_next_free_vgpr 253
; CHECK-NEXT: .amdhsa_next_free_sgpr 33
; CHECK-NEXT: .amdhsa_next_free_sgpr 36
define amdgpu_kernel void @kernel3() #0 {
bb:
call void @alias3() #2
Expand Down
2,684 changes: 1,463 additions & 1,221 deletions llvm/test/CodeGen/AMDGPU/call-argument-types.ll

Large diffs are not rendered by default.

89 changes: 47 additions & 42 deletions llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,20 @@
define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
; GCN-LABEL: call_memory_arg_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_load_dword s6, s[6:7], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_endpgm
%vgpr = load volatile i32, ptr addrspace(3) %ptr
call void @func(i32 %vgpr)
Expand All @@ -28,20 +29,21 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
; GCN-LABEL: call_memory_no_dep:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v0, v0, s[4:5]
; GCN-NEXT: global_store_dword v0, v0, s[6:7]
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_endpgm
store i32 0, ptr addrspace(1) %ptr
call void @func(i32 0)
Expand All @@ -52,18 +54,19 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+12
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: global_store_dword v40, v40, s[34:35]
; GCN-NEXT: s_endpgm
call void @func(i32 0)
Expand All @@ -74,18 +77,19 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #
define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 {
; GCN-LABEL: call_no_wait_after_call_return_val:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12
; GCN-NEXT: s_getpc_b64 s[8:9]
; GCN-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4
; GCN-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: global_store_dword v40, v0, s[34:35]
; GCN-NEXT: s_endpgm
%rv = call i32 @func.return(i32 0)
Expand All @@ -97,18 +101,19 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %
define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
; GCN-LABEL: call_got_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GCN-NEXT: s_add_u32 s0, s0, s11
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: s_endpgm
call void @got.func(i32 0)
ret void
Expand Down
20 changes: 12 additions & 8 deletions llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ define hidden void @use_queue_ptr() #1 {
}

; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr:
; GCN: s_swappc_b64 s[30:31], s[4:5]
; GCN: .amdhsa_user_sgpr_queue_ptr 0
; GCN: s_mov_b64 s[6:7], s[4:5]
; GCN: .amdhsa_user_sgpr_queue_ptr 1
define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
call void @use_queue_ptr()
ret void
}

; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[4:5], 0x0
; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10
; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16
; CIVI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]

Expand All @@ -52,8 +52,8 @@ define hidden void @use_queue_ptr_addrspacecast() #1 {
}

; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast:
; CIVI: s_swappc_b64 s[30:31], s[4:5]
; CIVI: .amdhsa_user_sgpr_queue_ptr 0
; CIVI: s_mov_b64 s[6:7], s[4:5]
; CIVI: .amdhsa_user_sgpr_queue_ptr 1

; GFX9-NOT: s_mov_b64 s[6:7]
; GFX9: .amdhsa_user_sgpr_queue_ptr 0
Expand Down Expand Up @@ -463,12 +463,15 @@ define hidden void @use_every_sgpr_input() #1 {
}

; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input:
; GCN: s_mov_b32 s13, s15
; GCN: s_mov_b32 s12, s14
; GCN: s_mov_b32 s14, s16
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64

; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
; GCN: .amdhsa_user_sgpr_dispatch_ptr 1
; GCN: .amdhsa_user_sgpr_queue_ptr 0
; GCN: .amdhsa_user_sgpr_queue_ptr 1
; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GCN: .amdhsa_user_sgpr_dispatch_id 1
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
Expand All @@ -487,13 +490,14 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
; We have to pass the kernarg segment, but there are no kernel
; arguments so null is passed.
; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input_no_kernargs:
; GCN: s_mov_b64 s[10:11], s[6:7]
; GCN: s_mov_b64 s[10:11], s[8:9]
; GCN: s_mov_b64 s[8:9], 0{{$}}
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64

; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
; GCN: .amdhsa_user_sgpr_dispatch_ptr 1
; GCN: .amdhsa_user_sgpr_queue_ptr 0
; GCN: .amdhsa_user_sgpr_queue_ptr 1
; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 0
; GCN: .amdhsa_user_sgpr_dispatch_id 1
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
Expand Down
304 changes: 160 additions & 144 deletions llvm/test/CodeGen/AMDGPU/cc-update.ll

Large diffs are not rendered by default.

90 changes: 45 additions & 45 deletions llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
Original file line number Diff line number Diff line change
Expand Up @@ -34,36 +34,36 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT: s_mov_b32 s14, -1
; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT: s_add_u32 s12, s12, s9
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT: ; implicit-def: $vgpr0
; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
; GCN_DBG-NEXT: s_load_dword s1, s[2:3], 0xa
; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: s_mov_b32 s2, -1
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_mov_b64 s[4:5], exec
; GCN_DBG-NEXT: s_mov_b64 s[6:7], exec
; GCN_DBG-NEXT: s_mov_b64 exec, -1
; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2
; GCN_DBG-NEXT: ; %bb.1: ; %for.exit
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB0_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
Expand All @@ -86,15 +86,15 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2
; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -142,31 +142,31 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT: s_mov_b32 s14, -1
; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT: s_add_u32 s12, s12, s9
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT: ; implicit-def: $vgpr0
; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_branch .LBB1_2
; GCN_DBG-NEXT: .LBB1_1: ; %for.exit
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB1_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
Expand All @@ -189,9 +189,9 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1
; GCN_DBG-NEXT: s_branch .LBB1_2
entry:
Expand Down Expand Up @@ -230,31 +230,31 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT: s_mov_b32 s14, -1
; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT: s_add_u32 s12, s12, s9
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT: ; implicit-def: $vgpr0
; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_branch .LBB2_2
; GCN_DBG-NEXT: .LBB2_1: ; %for.exit
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB2_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
Expand All @@ -277,9 +277,9 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1
; GCN_DBG-NEXT: s_branch .LBB2_2
entry:
Expand Down Expand Up @@ -319,31 +319,31 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT: s_mov_b32 s14, -1
; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT: s_add_u32 s12, s12, s9
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT: ; implicit-def: $vgpr0
; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_branch .LBB3_2
; GCN_DBG-NEXT: .LBB3_1: ; %for.exit
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: ; kill: killed $vgpr0
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB3_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: s_waitcnt expcnt(0)
; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
Expand All @@ -364,9 +364,9 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5]
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1
; GCN_DBG-NEXT: s_branch .LBB3_2
entry:
Expand Down Expand Up @@ -420,10 +420,10 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT: s_mov_b32 s14, -1
; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT: s_add_u32 s12, s12, s9
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT: ; implicit-def: $vgpr0
; GCN_DBG-NEXT: s_load_dword s0, s[2:3], 0x9
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
; OPT-NEXT: amdhsa.target: amdgcn-amd-amdhsa--gfx900
; OPT-NEXT: amdhsa.version:
; OPT-NEXT: - 1
; OPT-NEXT: - 2
; OPT-NEXT: - 1
; OPT-NEXT: ...
define internal i32 @func() {
ret i32 0
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s9
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
Expand Down Expand Up @@ -219,14 +219,14 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s9
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
Expand Down Expand Up @@ -428,14 +428,14 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s9
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -674,14 +674,14 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s9
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[8:9]
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x9
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-O0-NEXT: v_mov_b32_e32 v2, v1
; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GCN-O0-NEXT: s_mov_b32 s0, 0
Expand Down Expand Up @@ -929,14 +929,14 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
; GCN-O0-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-O0-NEXT: s_mov_b32 s14, -1
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-O0-NEXT: s_add_u32 s12, s12, s9
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
; GCN-O0-NEXT: ; implicit-def: $vgpr1
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-O0-NEXT: s_mov_b64 exec, s[6:7]
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0
; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1
Expand Down
60 changes: 32 additions & 28 deletions llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -176,28 +176,30 @@ bb1:
define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-LABEL: v3i16_registers:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
; GCN-NEXT: s_add_u32 s0, s0, s15
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s8, 0
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[8:9]
; GCN-NEXT: s_bitcmp1_b32 s12, 0
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
; GCN-NEXT: s_cbranch_vccnz .LBB4_2
; GCN-NEXT: ; %bb.1: ; %if.else
; GCN-NEXT: s_add_u32 s8, s6, 8
; GCN-NEXT: s_add_u32 s8, s8, 8
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s7, 0
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v3i16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, func_v3i16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_branch .LBB4_3
; GCN-NEXT: .LBB4_2:
; GCN-NEXT: v_mov_b32_e32 v0, 0
Expand Down Expand Up @@ -225,28 +227,30 @@ if.end: ; preds = %if.else, %if.then
define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-LABEL: v3f16_registers:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GCN-NEXT: s_mov_b64 s[10:11], s[8:9]
; GCN-NEXT: s_load_dword s8, s[6:7], 0x0
; GCN-NEXT: s_add_u32 s0, s0, s15
; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GCN-NEXT: s_load_dword s12, s[8:9], 0x0
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s8, 0
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[8:9]
; GCN-NEXT: s_bitcmp1_b32 s12, 0
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[12:13]
; GCN-NEXT: s_cbranch_vccnz .LBB5_2
; GCN-NEXT: ; %bb.1: ; %if.else
; GCN-NEXT: s_add_u32 s8, s6, 8
; GCN-NEXT: s_add_u32 s8, s8, 8
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: s_addc_u32 s9, s7, 0
; GCN-NEXT: s_addc_u32 s9, s9, 0
; GCN-NEXT: v_or3_b32 v31, v0, v1, v2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v3f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, func_v3f16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_getpc_b64 s[18:19]
; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_branch .LBB5_3
; GCN-NEXT: .LBB5_2:
; GCN-NEXT: v_mov_b32_e32 v0, 0
Expand Down
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
;
; GFX9-LABEL: load_i8_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
Expand Down Expand Up @@ -1026,7 +1026,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
;
; GFX9-LABEL: load_v2i8_to_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -1116,7 +1116,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
;
; GFX9-LABEL: load_v3i8_to_v3f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -1209,7 +1209,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
;
; GFX9-LABEL: load_v4i8_to_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -1334,7 +1334,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
;
; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -1497,7 +1497,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
;
; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -1672,13 +1672,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
;
; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_movk_i32 s4, 0x900
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v0, s[0:1]
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_movk_i32 s4, 0x900
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
Expand Down Expand Up @@ -1867,7 +1867,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
;
; GFX9-LABEL: load_v7i8_to_v7f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -2008,7 +2008,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
;
; GFX9-LABEL: load_v8i8_to_v8f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -2109,7 +2109,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
;
; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -2194,7 +2194,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
;
; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -2275,7 +2275,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
;
; GFX9-LABEL: i8_zext_i32_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
Expand Down Expand Up @@ -2389,7 +2389,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
;
; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -2489,7 +2489,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
;
; GFX9-LABEL: extract_byte0_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -2570,7 +2570,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
;
; GFX9-LABEL: extract_byte1_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -2652,7 +2652,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
;
; GFX9-LABEL: extract_byte2_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -2734,7 +2734,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
;
; GFX9-LABEL: extract_byte3_to_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -2823,7 +2823,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; GFX9-LABEL: cvt_ubyte0_or_multiuse:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ define i32 @divergent_lshr_and_cmp(i32 %x) {
entry:
%0 = and i32 %x, 2
%1 = icmp ne i32 %0, 0
; Prevent removal of truncate in SDag by inserting llvm.amdgcn.if
br i1 %1, label %out.true, label %out.else

out.true:
Expand All @@ -42,9 +43,9 @@ define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 %
; GCN-LABEL: name: uniform_opt_lshr_and_cmp
; GCN: bb.0.entry:
; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GCN-NEXT: liveins: $sgpr2_sgpr3
; GCN-NEXT: liveins: $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
Expand Down Expand Up @@ -83,6 +84,7 @@ define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 %
entry:
%0 = and i32 %x, 2
%1 = icmp ne i32 %0, 0
; Don't optimize the truncate in the SDag away.
br i1 %1, label %out.true, label %out.else

out.true:
Expand Down
60 changes: 32 additions & 28 deletions llvm/test/CodeGen/AMDGPU/ds_read2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1334,31 +1334,33 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
; CI-NEXT: s_getpc_b64 s[40:41]
; CI-NEXT: s_mov_b32 s40, s0
; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0
; CI-NEXT: s_mov_b64 s[10:11], s[4:5]
; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x0
; CI-NEXT: s_load_dword s4, s[2:3], 0x2
; CI-NEXT: s_mov_b32 s14, s8
; CI-NEXT: s_mov_b32 s14, s10
; CI-NEXT: s_mov_b32 s12, s8
; CI-NEXT: s_mov_b32 s13, s9
; CI-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s40, s40, s9
; CI-NEXT: s_add_u32 s40, s40, s11
; CI-NEXT: s_mov_b64 s[10:11], s[6:7]
; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0
; CI-NEXT: s_load_dword s6, s[4:5], 0x2
; CI-NEXT: s_addc_u32 s41, s41, 0
; CI-NEXT: s_add_u32 s8, s2, 12
; CI-NEXT: s_addc_u32 s9, s3, 0
; CI-NEXT: s_getpc_b64 s[2:3]
; CI-NEXT: s_add_u32 s2, s2, void_func_void@gotpcrel32@lo+4
; CI-NEXT: s_addc_u32 s3, s3, void_func_void@gotpcrel32@hi+12
; CI-NEXT: v_add_i32_e32 v40, vcc, s4, v3
; CI-NEXT: s_add_u32 s8, s4, 12
; CI-NEXT: s_addc_u32 s9, s5, 0
; CI-NEXT: s_getpc_b64 s[4:5]
; CI-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_add_i32_e32 v40, vcc, s6, v3
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; CI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CI-NEXT: ds_read_b32 v41, v40
; CI-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; CI-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_mov_b64 s[0:1], s[40:41]
; CI-NEXT: v_or_b32_e32 v31, v0, v2
; CI-NEXT: s_mov_b32 s12, s6
; CI-NEXT: s_mov_b32 s13, s7
; CI-NEXT: s_mov_b64 s[2:3], s[42:43]
; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_mov_b32 s39, 0xf000
Expand All @@ -1376,28 +1378,30 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
; GFX9-NEXT: s_getpc_b64 s[36:37]
; GFX9-NEXT: s_mov_b32 s36, s0
; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0
; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT: s_load_dword s4, s[2:3], 0x8
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s14, s8
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s12, s8
; GFX9-NEXT: s_mov_b32 s13, s9
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s36, s36, s9
; GFX9-NEXT: s_add_u32 s36, s36, s11
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: s_add_u32 s8, s2, 12
; GFX9-NEXT: s_addc_u32 s9, s3, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, void_func_void@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, void_func_void@gotpcrel32@hi+12
; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s4
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GFX9-NEXT: s_add_u32 s8, s4, 12
; GFX9-NEXT: s_addc_u32 s9, s5, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s6
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; GFX9-NEXT: ds_read_b32 v42, v41
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: v_mov_b32_e32 v40, 0
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,19 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9]
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47]
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[34:35]
; CHECK-NEXT: s_mov_b64 s[8:9], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/elf-header-osabi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

; NONE: OS/ABI: SystemV (0x0)
; HSA: OS/ABI: AMDGPU_HSA (0x40)
; HSA: ABIVersion: 3
; HSA: ABIVersion: 2
; PAL: OS/ABI: AMDGPU_PAL (0x41)
; PAL: ABIVersion: 0
; MESA3D: OS/ABI: AMDGPU_MESA3D (0x42)
Expand Down
36 changes: 19 additions & 17 deletions llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
Original file line number Diff line number Diff line change
Expand Up @@ -62,43 +62,45 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
define amdgpu_kernel void @kernel_calls_no_stack() {
; FLAT_SCR_OPT-LABEL: kernel_calls_no_stack:
; FLAT_SCR_OPT: ; %bb.0:
; FLAT_SCR_OPT-NEXT: s_add_u32 s6, s6, s11
; FLAT_SCR_OPT-NEXT: s_add_u32 s8, s8, s13
; FLAT_SCR_OPT-NEXT: s_mov_b32 s32, 0
; FLAT_SCR_OPT-NEXT: s_addc_u32 s7, s7, 0
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
; FLAT_SCR_OPT-NEXT: s_mov_b32 s14, s10
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[10:11], s[4:5]
; FLAT_SCR_OPT-NEXT: s_addc_u32 s9, s9, 0
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[8:9], s[4:5]
; FLAT_SCR_OPT-NEXT: s_getpc_b64 s[4:5]
; FLAT_SCR_OPT-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; FLAT_SCR_OPT-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; FLAT_SCR_OPT-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; FLAT_SCR_OPT-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FLAT_SCR_OPT-NEXT: s_mov_b32 s13, s9
; FLAT_SCR_OPT-NEXT: s_mov_b32 s12, s8
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[4:5], s[0:1]
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[8:9], s[2:3]
; FLAT_SCR_OPT-NEXT: s_mov_b32 s14, s12
; FLAT_SCR_OPT-NEXT: s_mov_b32 s13, s11
; FLAT_SCR_OPT-NEXT: s_mov_b32 s12, s10
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[10:11], s[6:7]
; FLAT_SCR_OPT-NEXT: v_or3_b32 v31, v0, v1, v2
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[4:5], s[0:1]
; FLAT_SCR_OPT-NEXT: s_mov_b64 s[6:7], s[2:3]
; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT_SCR_OPT-NEXT: s_swappc_b64 s[30:31], s[6:7]
; FLAT_SCR_OPT-NEXT: s_swappc_b64 s[30:31], s[16:17]
; FLAT_SCR_OPT-NEXT: s_endpgm
;
; FLAT_SCR_ARCH-LABEL: kernel_calls_no_stack:
; FLAT_SCR_ARCH: ; %bb.0:
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[10:11], s[4:5]
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s13, s9
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s12, s8
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[8:9], s[4:5]
; FLAT_SCR_ARCH-NEXT: s_getpc_b64 s[4:5]
; FLAT_SCR_ARCH-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; FLAT_SCR_ARCH-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
; FLAT_SCR_ARCH-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; FLAT_SCR_ARCH-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s14, s8
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s14, s10
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[10:11], s[6:7]
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[4:5], s[0:1]
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[8:9], s[2:3]
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s12, s6
; FLAT_SCR_ARCH-NEXT: s_mov_b64 s[6:7], s[2:3]
; FLAT_SCR_ARCH-NEXT: v_or3_b32 v31, v0, v1, v2
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s13, s7
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s32, 0
; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0)
; FLAT_SCR_ARCH-NEXT: s_swappc_b64 s[30:31], s[16:17]
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
; R600: |PV.{{[XYZW]}}|
; R600: -PV

; SI: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in) {
%bc = bitcast i32 %in to float
%fabs = call float @fabsf(float %bc)
Expand Down
31 changes: 12 additions & 19 deletions llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
Expand Down Expand Up @@ -65,7 +64,6 @@ define amdgpu_kernel void @minimal_kernel_inputs() {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
Expand All @@ -83,7 +81,7 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
}

; GCN-LABEL: {{^}}queue_ptr:
; GCN: global_load_u8 v{{[0-9]+}},
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]

; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s2
Expand All @@ -93,12 +91,11 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
; WORKAROUND: .amdhsa_user_sgpr_count 15
; NOWORKAROUND: .amdhsa_user_sgpr_count 2
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
Expand All @@ -120,16 +117,16 @@ define amdgpu_kernel void @queue_ptr() {
; WORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s14
; WORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s15

; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s6
; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s7
; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s8
; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s8
; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s9
; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s10

; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
; GCN: global_load_u8 v{{[0-9]+}},
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[2:3]
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]

; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s4
; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s5
; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s6
; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s7

; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_X]], off
; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Y]], off
Expand All @@ -138,22 +135,21 @@ define amdgpu_kernel void @queue_ptr() {

; GCN: .amdhsa_kernel all_inputs
; WORKAROUND: .amdhsa_user_sgpr_count 13
; NOWORKAROUND: .amdhsa_user_sgpr_count 6
; NOWORKAROUND: .amdhsa_user_sgpr_count 8
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13
; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 8
define amdgpu_kernel void @all_inputs() {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
Expand Down Expand Up @@ -192,6 +188,3 @@ declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #0
declare i64 @llvm.amdgcn.dispatch.id() #0

attributes #0 = { nounwind readnone speculatable willreturn }

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
1,155 changes: 605 additions & 550 deletions llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll

Large diffs are not rendered by default.

693 changes: 363 additions & 330 deletions llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll

Large diffs are not rendered by default.

693 changes: 363 additions & 330 deletions llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll

Large diffs are not rendered by default.

1,155 changes: 605 additions & 550 deletions llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll

Large diffs are not rendered by default.

38 changes: 19 additions & 19 deletions llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
; GCN-LABEL: name: extract_w_offset_vgpr
; GCN: bb.0.entry:
; GCN-NEXT: successors: %bb.1(0x80000000)
; GCN-NEXT: liveins: $vgpr0, $sgpr2_sgpr3
; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1
; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
; GCN-NEXT: renamable $sgpr4 = S_MOV_B32 61440
Expand Down Expand Up @@ -56,22 +56,22 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr2
; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr1
; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr0
; GCN-NEXT: undef %34.sub0:vreg_512 = COPY [[COPY1]]
; GCN-NEXT: %34.sub1:vreg_512 = COPY [[COPY2]]
; GCN-NEXT: %34.sub2:vreg_512 = COPY [[COPY3]]
; GCN-NEXT: %34.sub3:vreg_512 = COPY [[COPY4]]
; GCN-NEXT: %34.sub4:vreg_512 = COPY [[COPY5]]
; GCN-NEXT: %34.sub5:vreg_512 = COPY [[COPY6]]
; GCN-NEXT: %34.sub6:vreg_512 = COPY [[COPY7]]
; GCN-NEXT: %34.sub7:vreg_512 = COPY [[COPY8]]
; GCN-NEXT: %34.sub8:vreg_512 = COPY [[COPY9]]
; GCN-NEXT: %34.sub9:vreg_512 = COPY [[COPY10]]
; GCN-NEXT: %34.sub10:vreg_512 = COPY [[COPY11]]
; GCN-NEXT: %34.sub11:vreg_512 = COPY [[COPY12]]
; GCN-NEXT: %34.sub12:vreg_512 = COPY [[COPY13]]
; GCN-NEXT: %34.sub13:vreg_512 = COPY [[COPY14]]
; GCN-NEXT: %34.sub14:vreg_512 = COPY [[COPY15]]
; GCN-NEXT: %34.sub15:vreg_512 = COPY [[COPY16]]
; GCN-NEXT: undef %35.sub0:vreg_512 = COPY [[COPY1]]
; GCN-NEXT: %35.sub1:vreg_512 = COPY [[COPY2]]
; GCN-NEXT: %35.sub2:vreg_512 = COPY [[COPY3]]
; GCN-NEXT: %35.sub3:vreg_512 = COPY [[COPY4]]
; GCN-NEXT: %35.sub4:vreg_512 = COPY [[COPY5]]
; GCN-NEXT: %35.sub5:vreg_512 = COPY [[COPY6]]
; GCN-NEXT: %35.sub6:vreg_512 = COPY [[COPY7]]
; GCN-NEXT: %35.sub7:vreg_512 = COPY [[COPY8]]
; GCN-NEXT: %35.sub8:vreg_512 = COPY [[COPY9]]
; GCN-NEXT: %35.sub9:vreg_512 = COPY [[COPY10]]
; GCN-NEXT: %35.sub10:vreg_512 = COPY [[COPY11]]
; GCN-NEXT: %35.sub11:vreg_512 = COPY [[COPY12]]
; GCN-NEXT: %35.sub12:vreg_512 = COPY [[COPY13]]
; GCN-NEXT: %35.sub13:vreg_512 = COPY [[COPY14]]
; GCN-NEXT: %35.sub14:vreg_512 = COPY [[COPY15]]
; GCN-NEXT: %35.sub15:vreg_512 = COPY [[COPY16]]
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
Expand All @@ -85,7 +85,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec
; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, [[COPY]](s32), implicit $exec
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 %34, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec
; GCN-NEXT: [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 %35, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec
; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]]
; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY renamable $sgpr0_sgpr1
; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
Expand Down
116 changes: 59 additions & 57 deletions llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,140 +52,142 @@ define <2 x i64> @f1() #0 {
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
; GFX11-LABEL: f2:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5]
; GFX11-NEXT: v_mov_b32_e32 v31, v0
; GFX11-NEXT: s_load_b32 s21, s[6:7], 0x24
; GFX11-NEXT: s_mov_b32 s3, s14
; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24
; GFX11-NEXT: s_mov_b32 s18, s14
; GFX11-NEXT: s_mov_b32 s12, s13
; GFX11-NEXT: s_mov_b64 s[10:11], s[4:5]
; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31
; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX11-NEXT: s_mov_b32 s16, 0
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_mov_b32 s20, exec_lo
; GFX11-NEXT: s_mov_b32 s19, exec_lo
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v0, s21, v0
; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB2_13
; GFX11-NEXT: ; %bb.1: ; %bb14
; GFX11-NEXT: s_load_b128 s[16:19], s[6:7], 0x2c
; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitcmp1_b32 s17, 0
; GFX11-NEXT: s_cselect_b32 s22, -1, 0
; GFX11-NEXT: s_bitcmp0_b32 s17, 0
; GFX11-NEXT: s_mov_b32 s17, 0
; GFX11-NEXT: s_bitcmp1_b32 s21, 0
; GFX11-NEXT: s_cselect_b32 s25, -1, 0
; GFX11-NEXT: s_bitcmp0_b32 s21, 0
; GFX11-NEXT: s_mov_b32 s21, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB2_3
; GFX11-NEXT: ; %bb.2: ; %bb15
; GFX11-NEXT: s_add_u32 s8, s6, 0x58
; GFX11-NEXT: s_addc_u32 s9, s7, 0
; GFX11-NEXT: s_add_u32 s8, s16, 0x58
; GFX11-NEXT: s_addc_u32 s9, s17, 0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: s_mov_b32 s13, s18
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s14, s15
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_mov_b32 s1, -1
; GFX11-NEXT: s_cbranch_execz .LBB2_4
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
; GFX11-NEXT: s_cbranch_vccz .LBB2_4
; GFX11-NEXT: s_branch .LBB2_12
; GFX11-NEXT: .LBB2_3:
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB2_4: ; %bb16
; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x54
; GFX11-NEXT: s_bitcmp1_b32 s19, 0
; GFX11-NEXT: s_mov_b32 s8, -1
; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x54
; GFX11-NEXT: s_bitcmp1_b32 s23, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_and_b32 s9, s19, 1
; GFX11-NEXT: s_and_b32 s3, s23, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitcmp1_b32 s2, 0
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s9, 0
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_cselect_b32 s8, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s3, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB2_8
; GFX11-NEXT: ; %bb.5: ; %bb18.preheader
; GFX11-NEXT: s_load_b128 s[24:27], s[6:7], 0x44
; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mul_hi_u32 s8, s25, s24
; GFX11-NEXT: s_mul_i32 s9, s25, s24
; GFX11-NEXT: s_mul_hi_u32 s2, s29, s28
; GFX11-NEXT: s_mul_i32 s3, s29, s28
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_alignbit_b32 v0, s8, s9, 1
; GFX11-NEXT: s_mov_b32 s9, 0
; GFX11-NEXT: v_readfirstlane_b32 s8, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s22
; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, 1
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s8, s8, 1
; GFX11-NEXT: s_lshr_b32 s8, s8, s26
; GFX11-NEXT: s_or_b32 s2, s2, 1
; GFX11-NEXT: s_lshr_b32 s2, s2, s30
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s8, s8, s18
; GFX11-NEXT: s_mul_i32 s8, s8, s16
; GFX11-NEXT: s_mul_i32 s2, s2, s22
; GFX11-NEXT: s_mul_i32 s2, s2, s20
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s8, s21, s8
; GFX11-NEXT: s_lshl_b64 s[18:19], s[8:9], 1
; GFX11-NEXT: global_load_u16 v2, v1, s[18:19]
; GFX11-NEXT: s_or_b32 s2, s24, s2
; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], 1
; GFX11-NEXT: global_load_u16 v2, v1, s[22:23]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB2_6: ; %bb18
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: v_cmp_ne_u16_e64 s8, s9, 0
; GFX11-NEXT: v_cmp_ne_u16_e64 s2, s3, 0
; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s8
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-NEXT: s_and_b32 vcc_lo, s2, vcc_lo
; GFX11-NEXT: s_and_b32 vcc_lo, s8, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v3, v2, v3, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 vcc_lo, 0
; GFX11-NEXT: v_readfirstlane_b32 s8, v3
; GFX11-NEXT: v_readfirstlane_b32 s2, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-NEXT: s_bitcmp1_b32 s8, 0
; GFX11-NEXT: s_cselect_b32 s8, 0x100, 0
; GFX11-NEXT: s_bitcmp1_b32 s2, 0
; GFX11-NEXT: s_cselect_b32 s2, 0x100, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s9, s8, s9
; GFX11-NEXT: s_or_b32 s3, s2, s3
; GFX11-NEXT: s_cbranch_vccz .LBB2_6
; GFX11-NEXT: ; %bb.7: ; %Flow
; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: s_mov_b32 s2, 0
; GFX11-NEXT: .LBB2_8: ; %Flow12
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s8
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB2_12
; GFX11-NEXT: ; %bb.9:
; GFX11-NEXT: s_xor_b32 s0, s2, -1
; GFX11-NEXT: s_xor_b32 s0, s8, -1
; GFX11-NEXT: .LBB2_10: ; %bb17
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_vccz .LBB2_10
; GFX11-NEXT: ; %bb.11: ; %Flow6
; GFX11-NEXT: s_mov_b32 s17, -1
; GFX11-NEXT: s_mov_b32 s21, -1
; GFX11-NEXT: .LBB2_12: ; %Flow11
; GFX11-NEXT: s_and_b32 s16, s1, exec_lo
; GFX11-NEXT: s_or_not1_b32 s0, s17, exec_lo
; GFX11-NEXT: s_and_b32 s3, s1, exec_lo
; GFX11-NEXT: s_or_not1_b32 s0, s21, exec_lo
; GFX11-NEXT: .LBB2_13: ; %Flow9
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s20
; GFX11-NEXT: s_and_saveexec_b32 s17, s0
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19
; GFX11-NEXT: s_and_saveexec_b32 s19, s0
; GFX11-NEXT: s_cbranch_execz .LBB2_15
; GFX11-NEXT: ; %bb.14: ; %bb43
; GFX11-NEXT: s_add_u32 s8, s6, 0x58
; GFX11-NEXT: s_addc_u32 s9, s7, 0
; GFX11-NEXT: s_add_u32 s8, s16, 0x58
; GFX11-NEXT: s_addc_u32 s9, s17, 0
; GFX11-NEXT: s_getpc_b64 s[0:1]
; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4
; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: s_mov_b32 s13, s18
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s14, s15
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_or_b32 s16, s16, exec_lo
; GFX11-NEXT: s_or_b32 s3, s3, exec_lo
; GFX11-NEXT: .LBB2_15: ; %Flow14
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s17
; GFX11-NEXT: s_and_saveexec_b32 s0, s16
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19
; GFX11-NEXT: s_and_saveexec_b32 s0, s3
; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock
; GFX11-NEXT: ; divergent unreachable
; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock
Expand Down
57 changes: 32 additions & 25 deletions llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,30 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s32, 0x180000
; CHECK-NEXT: s_mov_b32 s33, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s12, s33, 0x100200
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v3, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s8, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_load_dword s8, s[6:7], 0x0
; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v0, s8, 0
; CHECK-NEXT: v_writelane_b32 v0, s8, 1
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s8, s33, 0x100200
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill
Expand All @@ -34,29 +42,28 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_add_i32 s8, s33, 0x100100
; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 s[16:17], 8
; CHECK-NEXT: s_mov_b32 s8, s6
; CHECK-NEXT: s_mov_b32 s6, s7
; CHECK-NEXT: s_mov_b32 s9, s16
; CHECK-NEXT: s_mov_b32 s7, s17
; CHECK-NEXT: s_add_u32 s8, s8, s9
; CHECK-NEXT: s_addc_u32 s6, s6, s7
; CHECK-NEXT: s_mov_b64 s[18:19], 8
; CHECK-NEXT: s_mov_b32 s8, s16
; CHECK-NEXT: s_mov_b32 s9, s17
; CHECK-NEXT: s_mov_b32 s16, s18
; CHECK-NEXT: s_mov_b32 s15, s19
; CHECK-NEXT: s_add_u32 s8, s8, s16
; CHECK-NEXT: s_addc_u32 s15, s9, s15
; CHECK-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; CHECK-NEXT: s_mov_b32 s9, s6
; CHECK-NEXT: s_mov_b32 s9, s15
; CHECK-NEXT: v_mov_b32_e32 v0, 0x2000
; CHECK-NEXT: ; implicit-def: $sgpr6
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, device_func@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, device_func@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[6:7], 0x0
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, device_func@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, device_func@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b32 s6, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v3, s6, v3
; CHECK-NEXT: s_mov_b32 s6, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2
; CHECK-NEXT: s_mov_b32 s15, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v3, s15, v3
; CHECK-NEXT: s_mov_b32 s15, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT: v_or3_b32 v31, v1, v2, v3
; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
Expand All @@ -69,7 +76,7 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_add_i32 s4, s33, 0x100100
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: v_readlane_b32 s4, v0, 0
; CHECK-NEXT: v_readlane_b32 s4, v0, 1
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_cmp_eq_u32 s4, s5
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000
Expand Down
236 changes: 112 additions & 124 deletions llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll

Large diffs are not rendered by default.

70 changes: 22 additions & 48 deletions llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,18 @@ define void @func_use_lds_global() {
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-SDAG-NEXT: s_trap 2
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_use_lds_global:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX8-GISEL-NEXT: s_mov_b32 m0, -1
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-GISEL-NEXT: s_trap 2
; GFX8-GISEL-NEXT: ds_write_b32 v0, v0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -103,18 +100,14 @@ define void @func_use_lds_global_constexpr_cast() {
; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-SDAG-NEXT: s_trap 2
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_use_lds_global_constexpr_cast:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-GISEL-NEXT: s_trap 2
; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v0
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -166,34 +159,29 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], vcc, -1
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_2
; GFX8-SDAG-NEXT: ; %bb.1: ; %bb1
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-SDAG-NEXT: s_trap 2
; GFX8-SDAG-NEXT: .LBB2_2: ; %Flow
; GFX8-SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-SDAG-NEXT: s_cbranch_execz .LBB2_4
; GFX8-SDAG-NEXT: ; %bb.3: ; %bb0
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-SDAG-NEXT: s_trap 2
; GFX8-SDAG-NEXT: .LBB2_4: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 2
; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-SDAG-NEXT: s_trap 2
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_uses_lds_multi:
Expand All @@ -202,35 +190,29 @@ define void @func_uses_lds_multi(i1 %cond) {
; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GFX8-GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX8-GISEL-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX8-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GFX8-GISEL-NEXT: s_cbranch_execz .LBB2_2
; GFX8-GISEL-NEXT: ; %bb.1: ; %bb1
; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 1
; GFX8-GISEL-NEXT: s_mov_b32 m0, -1
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-GISEL-NEXT: s_trap 2
; GFX8-GISEL-NEXT: ds_write_b32 v0, v0
; GFX8-GISEL-NEXT: .LBB2_2: ; %Flow
; GFX8-GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX8-GISEL-NEXT: s_cbranch_execz .LBB2_4
; GFX8-GISEL-NEXT: ; %bb.3: ; %bb0
; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX8-GISEL-NEXT: s_mov_b32 m0, -1
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-GISEL-NEXT: s_trap 2
; GFX8-GISEL-NEXT: ds_write_b32 v0, v0
; GFX8-GISEL-NEXT: .LBB2_4: ; %ret
; GFX8-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 2
; GFX8-GISEL-NEXT: s_mov_b32 m0, -1
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-GISEL-NEXT: s_trap 2
; GFX8-GISEL-NEXT: ds_write_b32 v0, v0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -379,23 +361,19 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: ds_write_b32 v0, v2
; GFX8-SDAG-NEXT: s_mov_b64 s[4:5], 0
; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, 1
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-SDAG-NEXT: s_trap 2
; GFX8-SDAG-NEXT: flat_store_dword v[0:1], v2
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_uses_lds_code_after:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[4:5], 0xc8
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX8-GISEL-NEXT: s_mov_b32 m0, -1
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-GISEL-NEXT: s_trap 2
; GFX8-GISEL-NEXT: ds_write_b32 v0, v2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 1
Expand Down Expand Up @@ -472,16 +450,14 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-SDAG-NEXT: ; %bb.1: ; %use.bb
; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX8-SDAG-NEXT: s_mov_b32 m0, -1
; GFX8-SDAG-NEXT: s_mov_b64 s[6:7], 0
; GFX8-SDAG-NEXT: ds_write_b32 v0, v0
; GFX8-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-SDAG-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-SDAG-NEXT: s_trap 2
; GFX8-SDAG-NEXT: flat_load_dword v0, v[1:2] glc
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: .LBB4_2: ; %ret
; GFX8-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: func_uses_lds_phi_after:
Expand All @@ -495,11 +471,9 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
; GFX8-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-GISEL-NEXT: s_cbranch_execz .LBB4_2
; GFX8-GISEL-NEXT: ; %bb.1: ; %use.bb
; GFX8-GISEL-NEXT: s_mov_b64 s[6:7], 0xc8
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX8-GISEL-NEXT: s_mov_b32 m0, -1
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX8-GISEL-NEXT: s_trap 2
; GFX8-GISEL-NEXT: ds_write_b32 v0, v0
; GFX8-GISEL-NEXT: flat_load_dword v0, v[1:2] glc
Expand Down
15 changes: 7 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIH %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

; GCN-LABEL: {{^}}is_private_vgpr:
; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]]
; CI-DAG: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CIT: v_cmp_eq_u32_e32 vcc, s4, v[[PTR_HI]]
; CIH: v_cmp_eq_u32_e32 vcc, s2, v[[PTR_HI]]
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; CI: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]]

; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base
; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]]
Expand All @@ -26,12 +25,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; select and vcc branch.

; GCN-LABEL: {{^}}is_private_sgpr:
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x1{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}

; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x32{{$}}
; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}}
; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}}

; CI: s_cmp_eq_u32 [[APERTURE]], [[PTR_HI]]
; CI: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]

; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_private_base
; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]]
Expand Down
15 changes: 7 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIH %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

; GCN-LABEL: {{^}}is_local_vgpr:
; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]]
; CI-DAG: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10

; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base
; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]]

; CIT: v_cmp_eq_u32_e32 vcc, s4, v[[PTR_HI]]
; CIH: v_cmp_eq_u32_e32 vcc, s2, v[[PTR_HI]]
; CI: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
%id = call i32 @llvm.amdgcn.workitem.id.x()
Expand All @@ -26,15 +25,15 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; select and vcc branch.

; GCN-LABEL: {{^}}is_local_sgpr:
; CI-DAG: s_load_dword s0, s[4:5], 0x1
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}

; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x33{{$}}
; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}}
; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}}

; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base
; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]]

; CI: s_cmp_eq_u32 s0, [[PTR_HI]]
; CI: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
%val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

; GCN-LABEL: {{^}}test_debug_value:
; NOOPT: .loc 1 1 42 prologue_end ; /tmp/test_debug_value.cl:1:42
; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; NOOPT-NEXT: .Ltmp
; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5

Expand Down
236 changes: 118 additions & 118 deletions llvm/test/CodeGen/AMDGPU/lower-kernargs.ll

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f0@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12
Expand Down Expand Up @@ -212,12 +212,14 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 1
; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
Expand Down Expand Up @@ -246,7 +248,7 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b64 s[8:9], s[4:5]
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
Expand Down
10 changes: 9 additions & 1 deletion llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=OPT %s
; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefix=GCN %s

Expand Down Expand Up @@ -239,15 +238,18 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 0
; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
; GCN: .amdhsa_group_segment_fixed_size 8
call void @f0()
call void @f1()
ret void
Expand All @@ -274,15 +276,18 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 2
; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
; GCN: .amdhsa_group_segment_fixed_size 16
call void @f2()
call void @f3()
ret void
Expand Down Expand Up @@ -313,6 +318,7 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 1
; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
Expand All @@ -325,8 +331,10 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GCN-NEXT: ds_write_b8 v0, v1 offset:2
; GCN-NEXT: s_mov_b64 s[6:7], s[8:9]
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
; GCN: .amdhsa_group_segment_fixed_size 16
call void @f1()
%ld = load i8, ptr addrspace(3) @v3
%mul = mul i8 %ld, 8
Expand Down
92 changes: 44 additions & 48 deletions llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,20 @@ store i32 0, ptr addrspace(3) @used_by_kernel
define amdgpu_kernel void @withcall() {
; GFX9-LABEL: withcall:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-NEXT: s_add_u32 s12, s12, s3
; GFX9-NEXT: s_addc_u32 s13, s13, 0
; GFX9-NEXT: s_add_u32 s8, s0, 36
; GFX9-NEXT: s_addc_u32 s9, s1, 0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 s[0:1], s[12:13]
; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-NEXT: s_add_u32 s8, s8, s3
; GFX9-NEXT: s_addc_u32 s9, s9, 0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b64 s[2:3], s[14:15]
; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -51,21 +50,20 @@ define amdgpu_kernel void @withcall() {
;
; GFX10-LABEL: withcall:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s14, -1
; GFX10-NEXT: s_mov_b32 s15, 0x31c16000
; GFX10-NEXT: s_add_u32 s12, s12, s3
; GFX10-NEXT: s_addc_u32 s13, s13, 0
; GFX10-NEXT: s_add_u32 s8, s0, 36
; GFX10-NEXT: s_addc_u32 s9, s1, 0
; GFX10-NEXT: s_getpc_b64 s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s10, -1
; GFX10-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-NEXT: s_add_u32 s8, s8, s3
; GFX10-NEXT: s_addc_u32 s9, s9, 0
; GFX10-NEXT: s_getpc_b64 s[2:3]
; GFX10-NEXT: s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-NEXT: s_mov_b64 s[0:1], s[12:13]
; GFX10-NEXT: s_mov_b64 s[2:3], s[14:15]
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
; GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX10-NEXT: s_mov_b32 s32, 0
; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -74,22 +72,21 @@ define amdgpu_kernel void @withcall() {
;
; G_GFX9-LABEL: withcall:
; G_GFX9: ; %bb.0:
; G_GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; G_GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; G_GFX9-NEXT: s_mov_b32 s14, -1
; G_GFX9-NEXT: s_mov_b32 s15, 0xe00000
; G_GFX9-NEXT: s_add_u32 s12, s12, s3
; G_GFX9-NEXT: s_addc_u32 s13, s13, 0
; G_GFX9-NEXT: s_add_u32 s8, s0, 36
; G_GFX9-NEXT: s_addc_u32 s9, s1, 0
; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; G_GFX9-NEXT: s_mov_b32 s10, -1
; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000
; G_GFX9-NEXT: s_add_u32 s8, s8, s3
; G_GFX9-NEXT: s_addc_u32 s9, s9, 0
; G_GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; G_GFX9-NEXT: s_getpc_b64 s[0:1]
; G_GFX9-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX9-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; G_GFX9-NEXT: s_mov_b64 s[0:1], s[12:13]
; G_GFX9-NEXT: s_mov_b64 s[0:1], s[8:9]
; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
; G_GFX9-NEXT: v_mov_b32_e32 v1, 8
; G_GFX9-NEXT: s_mov_b64 s[2:3], s[14:15]
; G_GFX9-NEXT: s_mov_b64 s[2:3], s[10:11]
; G_GFX9-NEXT: s_mov_b32 s32, 0
; G_GFX9-NEXT: ds_write_b32 v1, v0
; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -98,22 +95,21 @@ define amdgpu_kernel void @withcall() {
;
; G_GFX10-LABEL: withcall:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; G_GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; G_GFX10-NEXT: s_mov_b32 s14, -1
; G_GFX10-NEXT: s_mov_b32 s15, 0x31c16000
; G_GFX10-NEXT: s_add_u32 s12, s12, s3
; G_GFX10-NEXT: s_addc_u32 s13, s13, 0
; G_GFX10-NEXT: s_add_u32 s8, s0, 36
; G_GFX10-NEXT: s_addc_u32 s9, s1, 0
; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; G_GFX10-NEXT: s_mov_b32 s10, -1
; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
; G_GFX10-NEXT: s_add_u32 s8, s8, s3
; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
; G_GFX10-NEXT: s_mov_b64 s[6:7], s[0:1]
; G_GFX10-NEXT: s_getpc_b64 s[0:1]
; G_GFX10-NEXT: s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX10-NEXT: s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
; G_GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 8
; G_GFX10-NEXT: s_mov_b64 s[0:1], s[12:13]
; G_GFX10-NEXT: s_mov_b64 s[2:3], s[14:15]
; G_GFX10-NEXT: s_mov_b64 s[0:1], s[8:9]
; G_GFX10-NEXT: s_mov_b64 s[2:3], s[10:11]
; G_GFX10-NEXT: s_mov_b32 s32, 0
; G_GFX10-NEXT: ds_write_b32 v1, v0
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
Expand Down
94 changes: 61 additions & 33 deletions llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
Original file line number Diff line number Diff line change
Expand Up @@ -66,28 +66,37 @@ bb:
define amdgpu_kernel void @kernel_call() {
; CHECK-LABEL: kernel_call:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b32 s32, 0x400
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, csr_vgpr_spill_fp_callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, csr_vgpr_spill_fp_callee@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b32 s6, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2
; CHECK-NEXT: s_mov_b32 s6, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v1, s6, v1
; CHECK-NEXT: s_mov_b32 s15, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT: s_mov_b32 s15, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_callee()
Expand Down Expand Up @@ -125,28 +134,37 @@ bb:
define amdgpu_kernel void @kernel_tailcall() {
; CHECK-LABEL: kernel_tailcall:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b32 s32, 0x400
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, csr_vgpr_spill_fp_tailcall_callee@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, csr_vgpr_spill_fp_tailcall_callee@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b32 s6, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2
; CHECK-NEXT: s_mov_b32 s6, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v1, s6, v1
; CHECK-NEXT: s_mov_b32 s15, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT: s_mov_b32 s15, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
Expand Down Expand Up @@ -237,28 +255,38 @@ entry:
define protected amdgpu_kernel void @kernel() {
; CHECK-LABEL: kernel:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b32 s32, 0x400
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: s_mov_b64 s[8:9], s[6:7]
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, caller_save_vgpr_spill_fp@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, caller_save_vgpr_spill_fp@rel32@hi+12
; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b32 s6, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s6, v2
; CHECK-NEXT: s_mov_b32 s6, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v1, s6, v1
; CHECK-NEXT: s_mov_b32 s15, 20
; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2
; CHECK-NEXT: s_mov_b32 s15, 10
; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1
; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2
; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr0 killed $exec
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
entry:
%call = call i32 @caller_save_vgpr_spill_fp()
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
; GCN-NEXT: s_mov_b32 s93, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s94, -1
; GCN-NEXT: s_mov_b32 s95, 0xe8f000
; GCN-NEXT: s_add_u32 s92, s92, s9
; GCN-NEXT: s_add_u32 s92, s92, s11
; GCN-NEXT: s_addc_u32 s93, s93, 0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: ; implicit-def: $vgpr2
; GCN-NEXT: s_load_dword s0, s[2:3], 0xb
; GCN-NEXT: s_load_dword s0, s[4:5], 0xb
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[4:11]
; GCN-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -486,11 +486,11 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s54, -1
; GCN-NEXT: s_mov_b32 s55, 0xe8f000
; GCN-NEXT: s_add_u32 s52, s52, s9
; GCN-NEXT: s_add_u32 s52, s52, s11
; GCN-NEXT: s_addc_u32 s53, s53, 0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: ; implicit-def: $vgpr1
; GCN-NEXT: s_load_dword s0, s[2:3], 0xb
; GCN-NEXT: s_load_dword s0, s[4:5], 0xb
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[4:19]
; GCN-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -736,11 +736,11 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s54, -1
; GCN-NEXT: s_mov_b32 s55, 0xe8f000
; GCN-NEXT: s_add_u32 s52, s52, s9
; GCN-NEXT: s_add_u32 s52, s52, s11
; GCN-NEXT: s_addc_u32 s53, s53, 0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: s_load_dword s0, s[2:3], 0xb
; GCN-NEXT: s_load_dword s0, s[4:5], 0xb
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
Expand Down Expand Up @@ -987,11 +987,11 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s54, -1
; GCN-NEXT: s_mov_b32 s55, 0xe8f000
; GCN-NEXT: s_add_u32 s52, s52, s9
; GCN-NEXT: s_add_u32 s52, s52, s11
; GCN-NEXT: s_addc_u32 s53, s53, 0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: s_load_dword s0, s[2:3], 0x9
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
Expand Down
Loading