833 changes: 530 additions & 303 deletions llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


; GCN-LABEL: {{^}}divergent_if_endif:
; VGPR: workitem_private_segment_byte_size = 12{{$}}
; VGPR: workitem_private_segment_byte_size = 16{{$}}


; GCN: {{^}}; %bb.0:
Expand All @@ -19,7 +19,7 @@

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}}
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s{{[0-9]+}}

; Spill saved exec
; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec
Expand Down Expand Up @@ -82,13 +82,13 @@ endif:
}

; GCN-LABEL: {{^}}divergent_loop:
; VGPR: workitem_private_segment_byte_size = 16{{$}}
; VGPR: workitem_private_segment_byte_size = 20{{$}}

; GCN: {{^}}; %bb.0:
; GCN-DAG: s_mov_b32 m0, -1
; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}}
; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]]
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}}
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s{{[0-9]+}}

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
Expand Down Expand Up @@ -166,7 +166,7 @@ end:
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0
; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]]
; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, [[ZERO]]

; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec
; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]]
Expand All @@ -175,6 +175,7 @@ end:
; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
Expand All @@ -187,6 +188,7 @@ end:
; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]]

; GCN: [[FLOW]]: ; %Flow
; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET]] ; 4-byte Folded Reload
; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
Expand Down
24 changes: 8 additions & 16 deletions llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,20 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -69,22 +67,20 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -107,22 +103,20 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -145,11 +139,10 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4
Expand All @@ -158,10 +151,9 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 16
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
Expand All @@ -48,7 +48,6 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: v_writelane_b32 v40, s47, 15
; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v42, s16, 0
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
Expand Down Expand Up @@ -92,10 +91,9 @@ define weak_odr void @test(i32 %0) !dbg !34 {
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v42, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 16
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
; CHECK-NEXT: s_mov_b32 s33, s4
Expand Down
279 changes: 279 additions & 0 deletions llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -start-before=si-lower-sgpr-spills -stop-after=virtregrewriter,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s

# Tests to check the conservative liveness extension for the wwm registers during SGPR spill lowering.

# Even though the VGPR can be shared for the wwm-operand (writelane/readlane get inserted for the SGPR spills)
# and the regular operand (%0), they get different registers as we conservatively extend the liveness of the
# wwm-operands.
---
# Straight-line case: one SGPR spill/restore pair through %stack.0, followed by
# an unrelated VGPR value (%0) that is stored to memory.
name: test_single_block
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
stack:
# Single 4-byte slot tagged sgpr-spill; the spill pseudos in the body refer to it as %stack.0.
- { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
machineFunctionInfo:
isEntryFunction: false
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
stackPtrOffsetReg: '$sgpr32'
frameOffsetReg: '$sgpr33'
hasSpilledSGPRs: true
body: |
bb.0:
liveins: $sgpr4, $vgpr2_vgpr3
; GCN-LABEL: name: test_single_block
; GCN: liveins: $sgpr4, $vgpr2_vgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0
; GCN-NEXT: S_NOP 0
; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec
; GCN-NEXT: KILL killed renamable $vgpr0
; GCN-NEXT: SI_RETURN
; Input MIR. The checks above expect the SAVE/RESTORE pair on %stack.0 to be
; rewritten as V_WRITELANE_B32 / V_READLANE_B32 on lane 0 of $vgpr0, expect %0
; to land in $vgpr1, and expect a KILL of $vgpr0 right before SI_RETURN.
SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
S_NOP 0
renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
; %0 is only defined after the restore, yet the expected output still gives it a different VGPR.
%0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec
SI_RETURN
...

# Due to the presence of wwm-operand in the divergent flow, the regular variable (%0) shouldn't get the same register
# allocated for the wwm-operand in writelane/readlane when the SGPR spill is lowered.

---
# Branching case: the SGPR spill/restore pair sits only in bb.2, which bb.1 may
# skip via S_CBRANCH_EXECZ; %0 is defined in bb.1, redefined in bb.2, and read in bb.3.
name: test_if_else
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
stack:
# Single 4-byte slot tagged sgpr-spill; the spill pseudos in bb.2 refer to it as %stack.0.
- { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
machineFunctionInfo:
isEntryFunction: false
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
stackPtrOffsetReg: '$sgpr32'
frameOffsetReg: '$sgpr33'
hasSpilledSGPRs: true
body: |
; GCN-LABEL: name: test_if_else
; GCN: bb.0:
; GCN-NEXT: successors: %bb.1(0x80000000)
; GCN-NEXT: liveins: $sgpr6, $sgpr10_sgpr11
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: successors: %bb.3(0x80000000)
; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr6, 0, killed $vgpr0
; GCN-NEXT: S_NOP 0
; GCN-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 0
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
; GCN-NEXT: S_BRANCH %bb.3
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr10_sgpr11
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec
; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
; GCN-NEXT: KILL killed renamable $vgpr0
; GCN-NEXT: SI_RETURN
; Input MIR. The checks above expect $vgpr0 to be IMPLICIT_DEF'd in bb.0, listed
; as live-in to bb.1, bb.2 and bb.3, and killed only in bb.3, while %0 gets $vgpr1.
bb.0:
liveins: $sgpr6, $sgpr10_sgpr11
S_BRANCH %bb.1
bb.1:
liveins: $sgpr6, $sgpr10_sgpr11
; First definition of %0; bb.2 is bypassed when the EXECZ branch is taken.
%0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
S_CBRANCH_EXECZ %bb.3, implicit $exec
bb.2:
liveins: $sgpr6, $sgpr10_sgpr11
; The only SGPR spill/restore pair in this function.
SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
S_NOP 0
renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
; Second definition of %0, on the path that contains the spill.
%0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
S_BRANCH %bb.3
bb.3:
liveins: $sgpr10_sgpr11
; Join block: reads %0 from whichever predecessor defined it.
$sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec
S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0
SI_RETURN
...

# The wwm-register usage outside the loop should have the interference marked with
# all the regular virtual registers used in the test. The divergent loop index value (%1)
# can actually share the same VGPR as the wwm-operand. But since we extend the liveness of
# the wwm operand, an interference will always exist between them.

---
# Loop case: the SGPR spill/restore pair is in bb.1, ahead of the bb.3 <-> bb.4
# cycle that uses %1 and %2; the function returns from bb.5.
name: test_loop
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
stack:
# Single 4-byte slot tagged sgpr-spill; the spill pseudos in bb.1 refer to it as %stack.0.
- { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
machineFunctionInfo:
isEntryFunction: false
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
stackPtrOffsetReg: '$sgpr32'
frameOffsetReg: '$sgpr33'
hasSpilledSGPRs: true
body: |
; GCN-LABEL: name: test_loop
; GCN: bb.0:
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; GCN-NEXT: liveins: $sgpr4, $sgpr10_sgpr11
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
; GCN-NEXT: liveins: $sgpr4, $vgpr0, $sgpr10_sgpr11
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0
; GCN-NEXT: S_NOP 0
; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
; GCN-NEXT: S_BRANCH %bb.2
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: successors: %bb.3(0x80000000)
; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr1, $sgpr10_sgpr11
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0
; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec
; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 5, implicit $exec
; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
; GCN-NEXT: S_BRANCH %bb.3
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
; GCN-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000)
; GCN-NEXT: liveins: $vgpr0, $vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: $vcc = V_CMP_EQ_U32_e64 0, $vgpr1, implicit $exec
; GCN-NEXT: $sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GCN-NEXT: S_CBRANCH_SCC1 %bb.5, implicit $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.4:
; GCN-NEXT: successors: %bb.3(0x80000000)
; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr6_sgpr7
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr1 = V_SUB_U32_e32 1, killed $vgpr1, implicit $exec
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
; GCN-NEXT: S_BRANCH %bb.3
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.5:
; GCN-NEXT: liveins: $vgpr0, $sgpr6_sgpr7
; GCN-NEXT: {{ $}}
; GCN-NEXT: $exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc
; GCN-NEXT: KILL killed renamable $vgpr0
; GCN-NEXT: SI_RETURN
; Input MIR. The checks above expect $vgpr0 to hold the spilled lane and to stay
; live-in through bb.1..bb.5 until the KILL in bb.5, while %0, %1 and %2 are all
; assigned $vgpr1.
bb.0:
liveins: $sgpr4, $sgpr10_sgpr11
%0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
liveins: $sgpr4, $sgpr10_sgpr11
; The only SGPR spill/restore pair in this function.
SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
S_NOP 0
renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
%0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
S_BRANCH %bb.2
bb.2:
liveins: $sgpr4, $sgpr10_sgpr11
S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0
; Last use of %0; %1 is initialised right after and feeds the compare in bb.3.
$sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec
S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4
%1:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
S_CBRANCH_EXECZ %bb.3, implicit $exec
S_BRANCH %bb.3
bb.3:
; Loop header: compares %1 against 0, saves exec into $sgpr6_sgpr7, and may exit to bb.5.
$vcc = V_CMP_EQ_U32_e64 0, %1:vgpr_32, implicit $exec
$sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
S_CBRANCH_SCC1 %bb.5, implicit $scc
bb.4:
liveins: $sgpr6_sgpr7
; Loop body: recomputes %1 through the temporary %2, then branches back to bb.3.
%2:vgpr_32 = V_SUB_U32_e32 1, %1:vgpr_32, implicit $exec
%1:vgpr_32 = V_MOV_B32_e32 %2:vgpr_32, implicit $exec
S_BRANCH %bb.3
bb.5:
liveins: $sgpr6_sgpr7
; Exit block: ORs the saved mask in $sgpr6_sgpr7 back into exec before returning.
$exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc
SI_RETURN
...

# There must be one KILL instruction for the wwm-operand in every return block.
# Due to that, the wwm-register allocated should be different from the ones
# allocated for the regular virtual registers.

---
# Two-exit case: bb.1 and bb.2 both end in SI_RETURN, and only bb.1 contains
# the SGPR spill/restore pair.
name: test_multiple_return_blocks
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
stack:
# Single 4-byte slot tagged sgpr-spill; the spill pseudos in bb.1 refer to it as %stack.0.
- { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill }
machineFunctionInfo:
isEntryFunction: false
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
stackPtrOffsetReg: '$sgpr32'
frameOffsetReg: '$sgpr33'
hasSpilledSGPRs: true
body: |
; GCN-LABEL: name: test_multiple_return_blocks
; GCN: bb.0:
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; GCN-NEXT: liveins: $sgpr4, $vgpr2_vgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr2_vgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0
; GCN-NEXT: S_NOP 0
; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec
; GCN-NEXT: KILL killed renamable $vgpr0
; GCN-NEXT: SI_RETURN
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: liveins: $vgpr0, $vgpr2_vgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec
; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec
; GCN-NEXT: KILL killed renamable $vgpr0
; GCN-NEXT: SI_RETURN
; Input MIR. The checks above expect a KILL of $vgpr0 before the SI_RETURN in
; both bb.1 and bb.2 (even though bb.2 has no spill code), with %0 and %1 each
; assigned $vgpr1.
bb.0:
liveins: $sgpr4, $vgpr2_vgpr3
S_CBRANCH_EXECZ %bb.2, implicit $exec
bb.1:
liveins: $sgpr4, $vgpr2_vgpr3
; Return path that contains the SGPR spill/restore pair.
SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
S_NOP 0
renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32
%0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec
GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec
SI_RETURN
bb.2:
liveins: $vgpr2_vgpr3
; Return path with no spill code of its own.
%1:vgpr_32 = V_MOV_B32_e32 20, implicit $exec
GLOBAL_STORE_DWORD $vgpr2_vgpr3, %1:vgpr_32, 0, 0, implicit $exec
SI_RETURN
...
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: s_addk_i32 s32, 0x800
; GCN-NEXT: v_writelane_b32 v43, s16, 0
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v42, s16, 2
; GCN-NEXT: s_getpc_b64 s[16:17]
; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
Expand All @@ -39,12 +38,11 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: v_readlane_b32 s31, v42, 1
; GCN-NEXT: v_readlane_b32 s30, v42, 0
; GCN-NEXT: v_readlane_b32 s4, v43, 0
; GCN-NEXT: v_readlane_b32 s4, v42, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xf800
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
Expand Down
74 changes: 24 additions & 50 deletions llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
Original file line number Diff line number Diff line change
Expand Up @@ -117,21 +117,15 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, exec_lo
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3
; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 0
; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s5
; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0
; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0
; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 4
; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1
; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s5 ; 4-byte Folded Spill
; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1
; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4
; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 0
; FLAT_SCR_OPT-NEXT: scratch_load_dword v0, off, s5
; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s4
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105
; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8
; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1
; FLAT_SCR_OPT-NEXT: ;;#ASMSTART
Expand Down Expand Up @@ -228,44 +222,31 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_OPT-NEXT: ;;#ASMEND
; FLAT_SCR_OPT-NEXT: ;;#ASMSTART
; FLAT_SCR_OPT-NEXT: ;;#ASMEND
; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, exec_lo
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3
; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0
; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v1, s3
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 4
; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1
; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4
; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105
; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0
; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1
; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s3
; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s2
; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, 0
; FLAT_SCR_OPT-NEXT: global_store_dword v1, v0, s[0:1]
; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0
; FLAT_SCR_OPT-NEXT: ; kill: killed $vgpr1
; FLAT_SCR_OPT-NEXT: global_store_dword v2, v0, s[0:1]
; FLAT_SCR_OPT-NEXT: s_endpgm
;
; FLAT_SCR_ARCH-LABEL: test:
; FLAT_SCR_ARCH: ; %bb.0:
; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, exec_lo
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 0
; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s5
; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0
; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0)
; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 4
; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1
; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s5 ; 4-byte Folded Spill
; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4
; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 0
; FLAT_SCR_ARCH-NEXT: scratch_load_dword v0, off, s5
; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s4
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105
; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8
; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1
; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART
Expand Down Expand Up @@ -362,24 +343,17 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
; FLAT_SCR_ARCH-NEXT: ;;#ASMEND
; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART
; FLAT_SCR_ARCH-NEXT: ;;#ASMEND
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, exec_lo
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0
; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v1, s3
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 4
; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload
; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4
; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105
; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0
; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1
; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s3
; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0)
; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3
; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s2
; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, 0
; FLAT_SCR_ARCH-NEXT: global_store_dword v1, v0, s[0:1]
; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0
; FLAT_SCR_ARCH-NEXT: ; kill: killed $vgpr1
; FLAT_SCR_ARCH-NEXT: global_store_dword v2, v0, s[0:1]
; FLAT_SCR_ARCH-NEXT: s_endpgm
call void asm sideeffect "", "~{s[0:7]}" ()
call void asm sideeffect "", "~{s[8:15]}" ()
Expand Down
46 changes: 20 additions & 26 deletions llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,13 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo
; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def $exec_lo
Expand All @@ -38,14 +37,13 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi
; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def $exec_hi
Expand All @@ -64,17 +62,16 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit-def $exec
; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1
; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def $exec
Expand All @@ -96,13 +93,12 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo
Expand All @@ -120,13 +116,12 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi
Expand All @@ -144,16 +139,15 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1
; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1
; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec
Expand Down
14 changes: 6 additions & 8 deletions llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,13 @@ body: |
bb.0:
; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit-def $m0
; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
Expand All @@ -44,13 +43,12 @@ body: |
bb.0:
; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK: renamable $vgpr0 = IMPLICIT_DEF
; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0
; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0
; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0
; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0
; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,11 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32
; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[8:9], -1
; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9]
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s4, 2
; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0
; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s4, 0
; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1
; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33
; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -29,10 +28,9 @@ define void @callee_with_stack_and_call() #0 {
; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5]
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v41, 0
; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2
; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1
; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; SPILL-TO-VGPR-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7]
; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00
; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-LABEL: gfx_func:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: s_mov_b32 s38, s33
; SDAG-NEXT: s_mov_b32 s36, s33
; SDAG-NEXT: s_mov_b32 s33, s32
; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1
; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -81,14 +81,14 @@ define amdgpu_gfx void @gfx_func() {
; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; SDAG-NEXT: s_mov_b64 exec, s[34:35]
; SDAG-NEXT: s_addk_i32 s32, 0xfc00
; SDAG-NEXT: s_mov_b32 s33, s38
; SDAG-NEXT: s_mov_b32 s33, s36
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: gfx_func:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s38, s33
; GISEL-NEXT: s_mov_b32 s36, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -161,7 +161,7 @@ define amdgpu_gfx void @gfx_func() {
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[34:35]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s38
; GISEL-NEXT: s_mov_b32 s33, s36
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
call void @extern_c_func()
Expand Down
4,153 changes: 1,543 additions & 2,610 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll

Large diffs are not rendered by default.

396 changes: 152 additions & 244 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll

Large diffs are not rendered by default.

112 changes: 56 additions & 56 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll

Large diffs are not rendered by default.

72 changes: 28 additions & 44 deletions llvm/test/CodeGen/AMDGPU/indirect-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -397,9 +397,8 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -466,10 +465,9 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: v_readlane_b32 s34, v40, 2
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -483,9 +481,8 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v41, s16, 0
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -552,10 +549,9 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: v_readlane_b32 s34, v40, 2
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: v_readlane_b32 s4, v41, 0
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s4
Expand All @@ -573,9 +569,8 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -645,10 +640,9 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GCN-NEXT: v_readlane_b32 s34, v40, 2
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -662,9 +656,8 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v41, s16, 0
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -732,10 +725,9 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
; GISEL-NEXT: v_readlane_b32 s34, v40, 2
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: v_readlane_b32 s4, v41, 0
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s4
Expand All @@ -753,9 +745,8 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s16, 18
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -824,10 +815,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GCN-NEXT: v_readlane_b32 s34, v40, 2
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 18
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -841,9 +831,8 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v41, s16, 0
; GISEL-NEXT: v_writelane_b32 v40, s16, 18
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -912,10 +901,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: v_readlane_b32 s34, v40, 2
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: v_readlane_b32 s4, v41, 0
; GISEL-NEXT: v_readlane_b32 s4, v40, 18
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s4
Expand All @@ -934,9 +922,8 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v41, s16, 0
; GCN-NEXT: v_writelane_b32 v40, s16, 20
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -1014,10 +1001,9 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GCN-NEXT: v_readlane_b32 s34, v40, 2
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
; GCN-NEXT: v_readlane_b32 s4, v41, 0
; GCN-NEXT: v_readlane_b32 s4, v40, 20
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s4
Expand All @@ -1031,9 +1017,8 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GISEL-NEXT: s_mov_b64 exec, s[18:19]
; GISEL-NEXT: v_writelane_b32 v41, s16, 0
; GISEL-NEXT: v_writelane_b32 v40, s16, 20
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
Expand Down Expand Up @@ -1111,10 +1096,9 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
; GISEL-NEXT: v_readlane_b32 s34, v40, 2
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
; GISEL-NEXT: v_readlane_b32 s4, v41, 0
; GISEL-NEXT: v_readlane_b32 s4, v40, 20
; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[6:7]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s4
Expand Down Expand Up @@ -1327,7 +1311,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s12, s33
; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1418,14 +1402,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s12
; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s12, s33
; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1516,7 +1500,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s12
; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
call amdgpu_gfx void %fptr(i32 %i)
Expand All @@ -1531,7 +1515,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s12, s33
; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1620,14 +1604,14 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s12
; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s12, s33
; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1716,7 +1700,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s12
; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call amdgpu_gfx i32 %fptr(i32 %i)
Expand All @@ -1728,7 +1712,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GCN-LABEL: test_indirect_tail_call_vgpr_ptr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s12, s33
; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1814,14 +1798,14 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, s12
; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s12, s33
; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s33, s32
; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
Expand Down Expand Up @@ -1907,7 +1891,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
; GISEL-NEXT: s_mov_b32 s33, s12
; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
tail call amdgpu_gfx void %fptr()
Expand Down
73 changes: 37 additions & 36 deletions llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ define void @f0() {
; GFX11-LABEL: f0:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, s33
; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
; GFX11-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill
Expand All @@ -28,7 +28,7 @@ define void @f0() {
; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_add_i32 s32, s32, -16
; GFX11-NEXT: s_mov_b32 s33, s3
; GFX11-NEXT: s_mov_b32 s33, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
Expand All @@ -53,28 +53,28 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-LABEL: f2:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5]
; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x24
; GFX11-NEXT: v_mov_b32_e32 v31, v0
; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24
; GFX11-NEXT: s_mov_b32 s18, s14
; GFX11-NEXT: s_mov_b32 s12, s13
; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX11-NEXT: s_mov_b32 s20, 0
; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31
; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_mov_b32 s0, -1
; GFX11-NEXT: s_mov_b32 s19, exec_lo
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX11-NEXT: s_cbranch_execz .LBB2_13
; GFX11-NEXT: ; %bb.1: ; %bb14
; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitcmp1_b32 s21, 0
; GFX11-NEXT: s_cselect_b32 s24, -1, 0
; GFX11-NEXT: s_cselect_b32 s25, -1, 0
; GFX11-NEXT: s_bitcmp0_b32 s21, 0
; GFX11-NEXT: s_mov_b32 s21, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB2_3
Expand All @@ -90,40 +90,41 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_mov_b32 s1, -1
; GFX11-NEXT: s_cbranch_execz .LBB2_4
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3
; GFX11-NEXT: s_cbranch_vccz .LBB2_4
; GFX11-NEXT: s_branch .LBB2_12
; GFX11-NEXT: .LBB2_3:
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: .LBB2_4: ; %bb16
; GFX11-NEXT: s_load_b32 s3, s[16:17], 0x54
; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x54
; GFX11-NEXT: s_bitcmp1_b32 s23, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_and_b32 s9, s23, 1
; GFX11-NEXT: s_and_b32 s3, s23, 1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitcmp1_b32 s3, 0
; GFX11-NEXT: s_mov_b32 s3, -1
; GFX11-NEXT: s_bitcmp1_b32 s2, 0
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_cselect_b32 s8, -1, 0
; GFX11-NEXT: s_cmp_eq_u32 s9, 0
; GFX11-NEXT: s_cmp_eq_u32 s3, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB2_8
; GFX11-NEXT: ; %bb.5: ; %bb18.preheader
; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mul_hi_u32 s3, s29, s28
; GFX11-NEXT: s_mul_i32 s9, s29, s28
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_alignbit_b32 v0, s3, s9, 1
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s3, s3, 1
; GFX11-NEXT: s_lshr_b32 s3, s3, s30
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s9, s3, s22
; GFX11-NEXT: s_mul_hi_u32 s2, s29, s28
; GFX11-NEXT: s_mul_i32 s3, s29, s28
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, 1
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_mul_i32 s9, s9, s20
; GFX11-NEXT: s_or_b32 s2, s2, s9
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 1
; GFX11-NEXT: s_lshr_b32 s2, s2, s30
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s2, s2, s22
; GFX11-NEXT: s_mul_i32 s2, s2, s20
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s24, s2
; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], 1
; GFX11-NEXT: global_load_u16 v2, v1, s[22:23]
; GFX11-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -151,10 +152,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_or_b32 s3, s2, s3
; GFX11-NEXT: s_cbranch_vccz .LBB2_6
; GFX11-NEXT: ; %bb.7: ; %Flow
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_mov_b32 s2, 0
; GFX11-NEXT: .LBB2_8: ; %Flow12
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s3
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB2_12
; GFX11-NEXT: ; %bb.9:
; GFX11-NEXT: s_xor_b32 s0, s8, -1
Expand All @@ -166,11 +167,11 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: ; %bb.11: ; %Flow6
; GFX11-NEXT: s_mov_b32 s21, -1
; GFX11-NEXT: .LBB2_12: ; %Flow11
; GFX11-NEXT: s_and_b32 s20, s1, exec_lo
; GFX11-NEXT: s_and_b32 s3, s1, exec_lo
; GFX11-NEXT: s_or_not1_b32 s0, s21, exec_lo
; GFX11-NEXT: .LBB2_13: ; %Flow9
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19
; GFX11-NEXT: s_and_saveexec_b32 s2, s0
; GFX11-NEXT: s_and_saveexec_b32 s19, s0
; GFX11-NEXT: s_cbranch_execz .LBB2_15
; GFX11-NEXT: ; %bb.14: ; %bb43
; GFX11-NEXT: s_add_u32 s8, s16, 0x58
Expand All @@ -183,10 +184,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: s_mov_b32 s14, s15
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_or_b32 s20, s20, exec_lo
; GFX11-NEXT: s_or_b32 s3, s3, exec_lo
; GFX11-NEXT: .LBB2_15: ; %Flow14
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX11-NEXT: s_and_saveexec_b32 s0, s20
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19
; GFX11-NEXT: s_and_saveexec_b32 s0, s3
; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock
; GFX11-NEXT: ; divergent unreachable
; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock
Expand Down
44 changes: 36 additions & 8 deletions llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,30 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_writelane_b32 v40, s16, 0
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s12, s33, 0x100200
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v40, 0
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v3, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s8, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v40, s8, 1
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v0, s8, 1
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s8, s33, 0x100200
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def vgpr10
; CHECK-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -56,24 +69,39 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s4, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_add_i32 s4, s33, 0x100100
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s4, v40, 1
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: v_readlane_b32 s4, v0, 1
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_cmp_eq_u32 s4, s5
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %store
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s4, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_add_i32 s4, s33, 0x100000
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b32 v0, v1
; CHECK-NEXT: ds_write_b32 v1, v2
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: .LBB0_2: ; %end
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s4, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
%arr = alloca < 1339 x i32>, align 8192, addrspace(5)
%cmp = icmp ne i32 %val, 0
Expand Down
5 changes: 5 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI lower SGPR spill instructions
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI Lower WWM Copies
; GCN-O0-NEXT: SI Fix VGPR copies
; GCN-O0-NEXT: Remove Redundant DEBUG_VALUE analysis
; GCN-O0-NEXT: Fixup Statepoint Caller Saved
Expand Down Expand Up @@ -360,6 +361,7 @@
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: SI Lower WWM Copies
; GCN-O1-NEXT: GCN NSA Reassign
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: Stack Slot Coloring
Expand Down Expand Up @@ -653,6 +655,7 @@
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
Expand Down Expand Up @@ -956,6 +959,7 @@
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: SI Lower WWM Copies
; GCN-O2-NEXT: GCN NSA Reassign
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: Stack Slot Coloring
Expand Down Expand Up @@ -1271,6 +1275,7 @@
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: SI Lower WWM Copies
; GCN-O3-NEXT: GCN NSA Reassign
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: Stack Slot Coloring
Expand Down
590 changes: 349 additions & 241 deletions llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll

Large diffs are not rendered by default.

677 changes: 396 additions & 281 deletions llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll

Large diffs are not rendered by default.

6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
Original file line number Diff line number Diff line change
Expand Up @@ -192,13 +192,12 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: v_writelane_b32 v40, s4, 5
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v44, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s36, 3
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
Expand Down Expand Up @@ -230,10 +229,9 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s4, v44, 0
; GFX9-NEXT: v_readlane_b32 s4, v40, 5
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_addk_i32 s32, 0xf800
; GFX9-NEXT: s_mov_b32 s33, s4
Expand Down
47 changes: 36 additions & 11 deletions llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s24, s33
; CHECK-NEXT: s_mov_b32 s18, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
Expand All @@ -54,7 +54,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
; CHECK-NEXT: s_mov_b32 s33, s24
; CHECK-NEXT: s_mov_b32 s33, s18
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
Expand All @@ -66,12 +66,16 @@ bb:
define amdgpu_kernel void @kernel_call() {
; CHECK-LABEL: kernel_call:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_mov_b32 s32, 0x400
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
Expand All @@ -89,6 +93,10 @@ define amdgpu_kernel void @kernel_call() {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_callee()
Expand All @@ -113,9 +121,9 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12
; CHECK-NEXT: v_readlane_b32 s33, v1, 0
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: s_xor_saveexec_b64 s[20:21], -1
; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[20:21]
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: s_setpc_b64 s[16:17]
bb:
call void asm sideeffect "; clobber csr v40", "~{v40}"()
Expand All @@ -126,12 +134,16 @@ bb:
define amdgpu_kernel void @kernel_tailcall() {
; CHECK-LABEL: kernel_tailcall:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_mov_b32 s32, 0x400
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
Expand All @@ -149,6 +161,10 @@ define amdgpu_kernel void @kernel_tailcall() {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
bb:
tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee()
Expand All @@ -172,7 +188,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s24, s33
; CHECK-NEXT: s_mov_b32 s18, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
Expand All @@ -194,7 +210,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
; CHECK-NEXT: s_mov_b32 s33, s24
; CHECK-NEXT: s_mov_b32 s33, s18
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
Expand All @@ -206,7 +222,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s25, s33
; CHECK-NEXT: s_mov_b32 s19, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
Expand All @@ -228,7 +244,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
; CHECK-NEXT: s_mov_b32 s33, s25
; CHECK-NEXT: s_mov_b32 s33, s19
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
Expand All @@ -239,12 +255,16 @@ entry:
define protected amdgpu_kernel void @kernel() {
; CHECK-LABEL: kernel:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_mov_b32 s32, 0x400
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: v_writelane_b32 v3, s16, 0
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: s_mov_b32 s13, s15
; CHECK-NEXT: s_mov_b32 s12, s14
; CHECK-NEXT: v_readlane_b32 s14, v3, 0
Expand All @@ -262,6 +282,11 @@ define protected amdgpu_kernel void @kernel() {
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr0 killed $exec
; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[24:25]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
entry:
%call = call i32 @caller_save_vgpr_spill_fp()
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AMDGPU/nested-calls.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@ declare void @external_void_func_i32(i32) #0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-DAG: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0
; GCN-DAG: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2
; GCN-DAG: v_writelane_b32 v40, s30, 0
; GCN-DAG: v_writelane_b32 v40, s31, 1

Expand All @@ -26,10 +25,9 @@ declare void @external_void_func_i32(i32) #0
; GCN: v_readlane_b32 s31, v40, 1
; GCN: v_readlane_b32 s30, v40, 0

; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0
; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
Expand Down
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v41, s16, 0
; CHECK-NEXT: v_writelane_b32 v40, s16, 2
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
Expand All @@ -39,12 +38,11 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v41, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 2
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1
; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
Expand Down
1,233 changes: 667 additions & 566 deletions llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll

Large diffs are not rendered by default.

816 changes: 816 additions & 0 deletions llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll

Large diffs are not rendered by default.

391 changes: 375 additions & 16 deletions llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@
; DEFAULT-NEXT: Virtual Register Map
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: SI Lower WWM Copies
; DEFAULT-NEXT: GCN NSA Reassign
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: Stack Slot Coloring

; O0: Fast Register Allocator
; O0-NEXT: SI lower SGPR spill instructions
; O0-NEXT: Fast Register Allocator
; O0-NEXT: SI Lower WWM Copies
; O0-NEXT: SI Fix VGPR copies


Expand All @@ -49,6 +51,7 @@
; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter
; BASIC-DEFAULT-NEXT: Greedy Register Allocator
; BASIC-DEFAULT-NEXT: SI Lower WWM Copies
; BASIC-DEFAULT-NEXT: GCN NSA Reassign
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: Stack Slot Coloring
Expand All @@ -61,6 +64,7 @@
; DEFAULT-BASIC-NEXT: Virtual Register Map
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: Stack Slot Coloring
Expand All @@ -79,6 +83,7 @@
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: SI Lower WWM Copies
; BASIC-BASIC-NEXT: GCN NSA Reassign
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: Stack Slot Coloring
Expand Down
37 changes: 29 additions & 8 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s

# After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, replace the dead frame index in the DBG_VALUE instruction with reg 0.
# Otherwise, the test would crash during PEI while trying to replace the dead frame index.
Expand Down Expand Up @@ -39,13 +40,33 @@ machineFunctionInfo:
workGroupIDX: { reg: '$sgpr8' }
privateSegmentWaveByteOffset: { reg: '$sgpr9' }
body: |
; CHECK-LABEL: name: test
; CHECK: bb.0:
; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, $vgpr0
; CHECK: DBG_VALUE $noreg, 0
; CHECK: bb.1:
; CHECK: $sgpr10 = V_READLANE_B32 $vgpr0, 0
; CHECK: S_ENDPGM 0
; SGPR_SPILL-LABEL: name: test
; SGPR_SPILL: bb.0:
; SGPR_SPILL-NEXT: successors: %bb.1(0x80000000)
; SGPR_SPILL-NEXT: {{ $}}
; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILL-NEXT: renamable $sgpr10 = IMPLICIT_DEF
; SGPR_SPILL-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]]
; SGPR_SPILL-NEXT: DBG_VALUE $noreg, 0
; SGPR_SPILL-NEXT: {{ $}}
; SGPR_SPILL-NEXT: bb.1:
; SGPR_SPILL-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0
; SGPR_SPILL-NEXT: KILL [[V_WRITELANE_B32_]]
; SGPR_SPILL-NEXT: S_ENDPGM 0
; PEI-LABEL: name: test
; PEI: bb.0:
; PEI-NEXT: successors: %bb.1(0x80000000)
; PEI-NEXT: {{ $}}
; PEI-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; PEI-NEXT: renamable $sgpr10 = IMPLICIT_DEF
; PEI-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, killed $vgpr0
; PEI-NEXT: {{ $}}
; PEI-NEXT: bb.1:
; PEI-NEXT: liveins: $vgpr0
; PEI-NEXT: {{ $}}
; PEI-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 0
; PEI-NEXT: KILL killed renamable $vgpr0
; PEI-NEXT: S_ENDPGM 0
bb.0:
renamable $sgpr10 = IMPLICIT_DEF
SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s

# After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, we replace the dead frame index in the DBG_VALUE instruction with reg 0.
# Skip looking for frame indices in the debug value instruction for incoming arguments passed via stack. The test would crash otherwise.
Expand Down Expand Up @@ -45,7 +45,7 @@ machineFunctionInfo:
body: |
; CHECK-LABEL: name: test
; CHECK: bb.0:
; CHECK: DBG_VALUE $noreg, 0
; CHECK: DBG_VALUE
bb.0:
renamable $sgpr10 = IMPLICIT_DEF
SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32
Expand Down
300 changes: 158 additions & 142 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,17 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 s0, s0, s15
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: ; implicit-def: $vgpr0
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
Expand All @@ -23,186 +31,194 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v23, s8, 0
; GCN-NEXT: v_writelane_b32 v23, s9, 1
; GCN-NEXT: v_writelane_b32 v23, s10, 2
; GCN-NEXT: v_writelane_b32 v23, s11, 3
; GCN-NEXT: v_writelane_b32 v23, s12, 4
; GCN-NEXT: v_writelane_b32 v23, s13, 5
; GCN-NEXT: v_writelane_b32 v23, s14, 6
; GCN-NEXT: v_writelane_b32 v23, s15, 7
; GCN-NEXT: v_writelane_b32 v23, s16, 8
; GCN-NEXT: v_writelane_b32 v23, s17, 9
; GCN-NEXT: v_writelane_b32 v23, s18, 10
; GCN-NEXT: v_writelane_b32 v23, s19, 11
; GCN-NEXT: v_writelane_b32 v23, s20, 12
; GCN-NEXT: v_writelane_b32 v23, s21, 13
; GCN-NEXT: v_writelane_b32 v23, s22, 14
; GCN-NEXT: v_writelane_b32 v23, s23, 15
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_writelane_b32 v1, s8, 0
; GCN-NEXT: v_writelane_b32 v1, s9, 1
; GCN-NEXT: v_writelane_b32 v1, s10, 2
; GCN-NEXT: v_writelane_b32 v1, s11, 3
; GCN-NEXT: v_writelane_b32 v1, s12, 4
; GCN-NEXT: v_writelane_b32 v1, s13, 5
; GCN-NEXT: v_writelane_b32 v1, s14, 6
; GCN-NEXT: v_writelane_b32 v1, s15, 7
; GCN-NEXT: v_writelane_b32 v1, s16, 8
; GCN-NEXT: v_writelane_b32 v1, s17, 9
; GCN-NEXT: v_writelane_b32 v1, s18, 10
; GCN-NEXT: v_writelane_b32 v1, s19, 11
; GCN-NEXT: v_writelane_b32 v1, s20, 12
; GCN-NEXT: v_writelane_b32 v1, s21, 13
; GCN-NEXT: v_writelane_b32 v1, s22, 14
; GCN-NEXT: v_writelane_b32 v1, s23, 15
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v23, s8, 16
; GCN-NEXT: v_writelane_b32 v23, s9, 17
; GCN-NEXT: v_writelane_b32 v23, s10, 18
; GCN-NEXT: v_writelane_b32 v23, s11, 19
; GCN-NEXT: v_writelane_b32 v23, s12, 20
; GCN-NEXT: v_writelane_b32 v23, s13, 21
; GCN-NEXT: v_writelane_b32 v23, s14, 22
; GCN-NEXT: v_writelane_b32 v23, s15, 23
; GCN-NEXT: v_writelane_b32 v23, s16, 24
; GCN-NEXT: v_writelane_b32 v23, s17, 25
; GCN-NEXT: v_writelane_b32 v23, s18, 26
; GCN-NEXT: v_writelane_b32 v23, s19, 27
; GCN-NEXT: v_writelane_b32 v23, s20, 28
; GCN-NEXT: v_writelane_b32 v23, s21, 29
; GCN-NEXT: v_writelane_b32 v23, s22, 30
; GCN-NEXT: v_writelane_b32 v23, s23, 31
; GCN-NEXT: v_writelane_b32 v1, s8, 16
; GCN-NEXT: v_writelane_b32 v1, s9, 17
; GCN-NEXT: v_writelane_b32 v1, s10, 18
; GCN-NEXT: v_writelane_b32 v1, s11, 19
; GCN-NEXT: v_writelane_b32 v1, s12, 20
; GCN-NEXT: v_writelane_b32 v1, s13, 21
; GCN-NEXT: v_writelane_b32 v1, s14, 22
; GCN-NEXT: v_writelane_b32 v1, s15, 23
; GCN-NEXT: v_writelane_b32 v1, s16, 24
; GCN-NEXT: v_writelane_b32 v1, s17, 25
; GCN-NEXT: v_writelane_b32 v1, s18, 26
; GCN-NEXT: v_writelane_b32 v1, s19, 27
; GCN-NEXT: v_writelane_b32 v1, s20, 28
; GCN-NEXT: v_writelane_b32 v1, s21, 29
; GCN-NEXT: v_writelane_b32 v1, s22, 30
; GCN-NEXT: v_writelane_b32 v1, s23, 31
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v23, s8, 32
; GCN-NEXT: v_writelane_b32 v23, s9, 33
; GCN-NEXT: v_writelane_b32 v23, s10, 34
; GCN-NEXT: v_writelane_b32 v23, s11, 35
; GCN-NEXT: v_writelane_b32 v23, s12, 36
; GCN-NEXT: v_writelane_b32 v23, s13, 37
; GCN-NEXT: v_writelane_b32 v23, s14, 38
; GCN-NEXT: v_writelane_b32 v23, s15, 39
; GCN-NEXT: v_writelane_b32 v23, s16, 40
; GCN-NEXT: v_writelane_b32 v23, s17, 41
; GCN-NEXT: v_writelane_b32 v23, s18, 42
; GCN-NEXT: v_writelane_b32 v23, s19, 43
; GCN-NEXT: v_writelane_b32 v23, s20, 44
; GCN-NEXT: v_writelane_b32 v23, s21, 45
; GCN-NEXT: v_writelane_b32 v23, s22, 46
; GCN-NEXT: v_writelane_b32 v23, s23, 47
; GCN-NEXT: v_writelane_b32 v1, s8, 32
; GCN-NEXT: v_writelane_b32 v1, s9, 33
; GCN-NEXT: v_writelane_b32 v1, s10, 34
; GCN-NEXT: v_writelane_b32 v1, s11, 35
; GCN-NEXT: v_writelane_b32 v1, s12, 36
; GCN-NEXT: v_writelane_b32 v1, s13, 37
; GCN-NEXT: v_writelane_b32 v1, s14, 38
; GCN-NEXT: v_writelane_b32 v1, s15, 39
; GCN-NEXT: v_writelane_b32 v1, s16, 40
; GCN-NEXT: v_writelane_b32 v1, s17, 41
; GCN-NEXT: v_writelane_b32 v1, s18, 42
; GCN-NEXT: v_writelane_b32 v1, s19, 43
; GCN-NEXT: v_writelane_b32 v1, s20, 44
; GCN-NEXT: v_writelane_b32 v1, s21, 45
; GCN-NEXT: v_writelane_b32 v1, s22, 46
; GCN-NEXT: v_writelane_b32 v1, s23, 47
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v23, s8, 48
; GCN-NEXT: v_writelane_b32 v23, s9, 49
; GCN-NEXT: v_writelane_b32 v23, s10, 50
; GCN-NEXT: v_writelane_b32 v23, s11, 51
; GCN-NEXT: v_writelane_b32 v23, s12, 52
; GCN-NEXT: v_writelane_b32 v23, s13, 53
; GCN-NEXT: v_writelane_b32 v23, s14, 54
; GCN-NEXT: v_writelane_b32 v23, s15, 55
; GCN-NEXT: v_writelane_b32 v23, s16, 56
; GCN-NEXT: v_writelane_b32 v23, s17, 57
; GCN-NEXT: v_writelane_b32 v23, s18, 58
; GCN-NEXT: v_writelane_b32 v23, s19, 59
; GCN-NEXT: v_writelane_b32 v23, s20, 60
; GCN-NEXT: v_writelane_b32 v23, s21, 61
; GCN-NEXT: v_writelane_b32 v23, s22, 62
; GCN-NEXT: v_writelane_b32 v23, s23, 63
; GCN-NEXT: v_writelane_b32 v1, s8, 48
; GCN-NEXT: v_writelane_b32 v1, s9, 49
; GCN-NEXT: v_writelane_b32 v1, s10, 50
; GCN-NEXT: v_writelane_b32 v1, s11, 51
; GCN-NEXT: v_writelane_b32 v1, s12, 52
; GCN-NEXT: v_writelane_b32 v1, s13, 53
; GCN-NEXT: v_writelane_b32 v1, s14, 54
; GCN-NEXT: v_writelane_b32 v1, s15, 55
; GCN-NEXT: v_writelane_b32 v1, s16, 56
; GCN-NEXT: v_writelane_b32 v1, s17, 57
; GCN-NEXT: v_writelane_b32 v1, s18, 58
; GCN-NEXT: v_writelane_b32 v1, s19, 59
; GCN-NEXT: v_writelane_b32 v1, s20, 60
; GCN-NEXT: v_writelane_b32 v1, s21, 61
; GCN-NEXT: v_writelane_b32 v1, s22, 62
; GCN-NEXT: v_writelane_b32 v1, s23, 63
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[6:7]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_mov_b64 s[8:9], exec
; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_writelane_b32 v0, s6, 0
; GCN-NEXT: v_writelane_b32 v0, s7, 1
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[8:9]
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s4, s5
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: v_readlane_b32 s4, v23, 0
; GCN-NEXT: v_readlane_b32 s5, v23, 1
; GCN-NEXT: v_readlane_b32 s6, v23, 2
; GCN-NEXT: v_readlane_b32 s7, v23, 3
; GCN-NEXT: v_readlane_b32 s8, v23, 4
; GCN-NEXT: v_readlane_b32 s9, v23, 5
; GCN-NEXT: v_readlane_b32 s10, v23, 6
; GCN-NEXT: v_readlane_b32 s11, v23, 7
; GCN-NEXT: v_readlane_b32 s12, v23, 8
; GCN-NEXT: v_readlane_b32 s13, v23, 9
; GCN-NEXT: v_readlane_b32 s14, v23, 10
; GCN-NEXT: v_readlane_b32 s15, v23, 11
; GCN-NEXT: v_readlane_b32 s16, v23, 12
; GCN-NEXT: v_readlane_b32 s17, v23, 13
; GCN-NEXT: v_readlane_b32 s18, v23, 14
; GCN-NEXT: v_readlane_b32 s19, v23, 15
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v1, 0
; GCN-NEXT: v_readlane_b32 s5, v1, 1
; GCN-NEXT: v_readlane_b32 s6, v1, 2
; GCN-NEXT: v_readlane_b32 s7, v1, 3
; GCN-NEXT: v_readlane_b32 s8, v1, 4
; GCN-NEXT: v_readlane_b32 s9, v1, 5
; GCN-NEXT: v_readlane_b32 s10, v1, 6
; GCN-NEXT: v_readlane_b32 s11, v1, 7
; GCN-NEXT: v_readlane_b32 s12, v1, 8
; GCN-NEXT: v_readlane_b32 s13, v1, 9
; GCN-NEXT: v_readlane_b32 s14, v1, 10
; GCN-NEXT: v_readlane_b32 s15, v1, 11
; GCN-NEXT: v_readlane_b32 s16, v1, 12
; GCN-NEXT: v_readlane_b32 s17, v1, 13
; GCN-NEXT: v_readlane_b32 s18, v1, 14
; GCN-NEXT: v_readlane_b32 s19, v1, 15
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s4, v23, 16
; GCN-NEXT: v_readlane_b32 s5, v23, 17
; GCN-NEXT: v_readlane_b32 s6, v23, 18
; GCN-NEXT: v_readlane_b32 s7, v23, 19
; GCN-NEXT: v_readlane_b32 s8, v23, 20
; GCN-NEXT: v_readlane_b32 s9, v23, 21
; GCN-NEXT: v_readlane_b32 s10, v23, 22
; GCN-NEXT: v_readlane_b32 s11, v23, 23
; GCN-NEXT: v_readlane_b32 s12, v23, 24
; GCN-NEXT: v_readlane_b32 s13, v23, 25
; GCN-NEXT: v_readlane_b32 s14, v23, 26
; GCN-NEXT: v_readlane_b32 s15, v23, 27
; GCN-NEXT: v_readlane_b32 s16, v23, 28
; GCN-NEXT: v_readlane_b32 s17, v23, 29
; GCN-NEXT: v_readlane_b32 s18, v23, 30
; GCN-NEXT: v_readlane_b32 s19, v23, 31
; GCN-NEXT: v_readlane_b32 s4, v1, 16
; GCN-NEXT: v_readlane_b32 s5, v1, 17
; GCN-NEXT: v_readlane_b32 s6, v1, 18
; GCN-NEXT: v_readlane_b32 s7, v1, 19
; GCN-NEXT: v_readlane_b32 s8, v1, 20
; GCN-NEXT: v_readlane_b32 s9, v1, 21
; GCN-NEXT: v_readlane_b32 s10, v1, 22
; GCN-NEXT: v_readlane_b32 s11, v1, 23
; GCN-NEXT: v_readlane_b32 s12, v1, 24
; GCN-NEXT: v_readlane_b32 s13, v1, 25
; GCN-NEXT: v_readlane_b32 s14, v1, 26
; GCN-NEXT: v_readlane_b32 s15, v1, 27
; GCN-NEXT: v_readlane_b32 s16, v1, 28
; GCN-NEXT: v_readlane_b32 s17, v1, 29
; GCN-NEXT: v_readlane_b32 s18, v1, 30
; GCN-NEXT: v_readlane_b32 s19, v1, 31
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s4, v23, 32
; GCN-NEXT: v_readlane_b32 s5, v23, 33
; GCN-NEXT: v_readlane_b32 s6, v23, 34
; GCN-NEXT: v_readlane_b32 s7, v23, 35
; GCN-NEXT: v_readlane_b32 s8, v23, 36
; GCN-NEXT: v_readlane_b32 s9, v23, 37
; GCN-NEXT: v_readlane_b32 s10, v23, 38
; GCN-NEXT: v_readlane_b32 s11, v23, 39
; GCN-NEXT: v_readlane_b32 s12, v23, 40
; GCN-NEXT: v_readlane_b32 s13, v23, 41
; GCN-NEXT: v_readlane_b32 s14, v23, 42
; GCN-NEXT: v_readlane_b32 s15, v23, 43
; GCN-NEXT: v_readlane_b32 s16, v23, 44
; GCN-NEXT: v_readlane_b32 s17, v23, 45
; GCN-NEXT: v_readlane_b32 s18, v23, 46
; GCN-NEXT: v_readlane_b32 s19, v23, 47
; GCN-NEXT: v_readlane_b32 s4, v1, 32
; GCN-NEXT: v_readlane_b32 s5, v1, 33
; GCN-NEXT: v_readlane_b32 s6, v1, 34
; GCN-NEXT: v_readlane_b32 s7, v1, 35
; GCN-NEXT: v_readlane_b32 s8, v1, 36
; GCN-NEXT: v_readlane_b32 s9, v1, 37
; GCN-NEXT: v_readlane_b32 s10, v1, 38
; GCN-NEXT: v_readlane_b32 s11, v1, 39
; GCN-NEXT: v_readlane_b32 s12, v1, 40
; GCN-NEXT: v_readlane_b32 s13, v1, 41
; GCN-NEXT: v_readlane_b32 s14, v1, 42
; GCN-NEXT: v_readlane_b32 s15, v1, 43
; GCN-NEXT: v_readlane_b32 s16, v1, 44
; GCN-NEXT: v_readlane_b32 s17, v1, 45
; GCN-NEXT: v_readlane_b32 s18, v1, 46
; GCN-NEXT: v_readlane_b32 s19, v1, 47
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s8, v23, 48
; GCN-NEXT: v_readlane_b32 s9, v23, 49
; GCN-NEXT: v_readlane_b32 s10, v23, 50
; GCN-NEXT: v_readlane_b32 s11, v23, 51
; GCN-NEXT: v_readlane_b32 s12, v23, 52
; GCN-NEXT: v_readlane_b32 s13, v23, 53
; GCN-NEXT: v_readlane_b32 s14, v23, 54
; GCN-NEXT: v_readlane_b32 s15, v23, 55
; GCN-NEXT: v_readlane_b32 s16, v23, 56
; GCN-NEXT: v_readlane_b32 s17, v23, 57
; GCN-NEXT: v_readlane_b32 s18, v23, 58
; GCN-NEXT: v_readlane_b32 s19, v23, 59
; GCN-NEXT: v_readlane_b32 s20, v23, 60
; GCN-NEXT: v_readlane_b32 s21, v23, 61
; GCN-NEXT: v_readlane_b32 s22, v23, 62
; GCN-NEXT: v_readlane_b32 s23, v23, 63
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_mov_b64 exec, 3
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s8, v1, 48
; GCN-NEXT: v_readlane_b32 s9, v1, 49
; GCN-NEXT: v_readlane_b32 s10, v1, 50
; GCN-NEXT: v_readlane_b32 s11, v1, 51
; GCN-NEXT: v_readlane_b32 s12, v1, 52
; GCN-NEXT: v_readlane_b32 s13, v1, 53
; GCN-NEXT: v_readlane_b32 s14, v1, 54
; GCN-NEXT: v_readlane_b32 s15, v1, 55
; GCN-NEXT: v_readlane_b32 s16, v1, 56
; GCN-NEXT: v_readlane_b32 s17, v1, 57
; GCN-NEXT: v_readlane_b32 s18, v1, 58
; GCN-NEXT: v_readlane_b32 s19, v1, 59
; GCN-NEXT: v_readlane_b32 s20, v1, 60
; GCN-NEXT: v_readlane_b32 s21, v1, 61
; GCN-NEXT: v_readlane_b32 s22, v1, 62
; GCN-NEXT: v_readlane_b32 s23, v1, 63
; GCN-NEXT: v_readlane_b32 s4, v0, 0
; GCN-NEXT: v_readlane_b32 s5, v0, 1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:5]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB0_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
; GCN-NEXT: s_endpgm
call void asm sideeffect "", "~{v[0:7]}" () #0
call void asm sideeffect "", "~{v[8:15]}" () #0
Expand Down
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ body: |
liveins: $sgpr4
; CHECK-LABEL: name: sgpr_spill_s64_undef_high32
; CHECK: liveins: $sgpr4, $vgpr0
; CHECK: liveins: $sgpr4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5
SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
...
Expand All @@ -45,10 +46,11 @@ body: |
liveins: $sgpr5
; CHECK-LABEL: name: sgpr_spill_s64_undef_low32
; CHECK: liveins: $sgpr5, $vgpr0
; CHECK: liveins: $sgpr5
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5
SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
...
Loading