522 changes: 260 additions & 262 deletions llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/illegal-eviction-assert.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -start-before=greedy,0 -stop-after=virtregrewriter,1 -o - 2>%t.err %s | FileCheck %s
# RUN: not llc -mtriple=amdgcn -mcpu=gfx900 -start-before=greedy,0 -stop-after=virtregrewriter,2 -o - 2>%t.err %s | FileCheck %s
# RUN: FileCheck -check-prefix=ERR %s < %t.err

# This testcase cannot be compiled. An attempted eviction legality
2,482 changes: 1,219 additions & 1,263 deletions llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll

Large diffs are not rendered by default.

14 changes: 6 additions & 8 deletions llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll
@@ -7,13 +7,13 @@ define fastcc i32 @foo() {
; CHECK-LABEL: name: foo
; CHECK: bb.0 (%ir-block.0):
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $vgpr40, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAITCNT 0
; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33
; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32
; CHECK-NEXT: $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17
; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40
@@ -26,24 +26,22 @@ define fastcc i32 @foo() {
; CHECK-NEXT: BUFFER_GL1_INV implicit $exec
; CHECK-NEXT: BUFFER_GL0_INV implicit $exec
; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, killed $vgpr40
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, killed $vgpr40
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40
; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40
; CHECK-NEXT: S_WAITCNT 49279
; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1 (%ir-block.1):
; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; CHECK-NEXT: liveins: $vcc_lo, $vgpr40
; CHECK-NEXT: liveins: $vcc_lo
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.DummyReturnBlock:
; CHECK-NEXT: liveins: $vgpr40
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1
; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0
; CHECK-NEXT: $sgpr4 = V_READLANE_B32 killed $vgpr40, 2
; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2
; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5
153 changes: 76 additions & 77 deletions llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -1010,73 +1010,73 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GCN-NEXT: s_load_dword s8, s[2:3], 0x44
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GCN-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_u32 s9, s4, 0xf0001
; GCN-NEXT: s_lshr_b32 s42, s5, 16
; GCN-NEXT: v_writelane_b32 v0, s0, 0
; GCN-NEXT: v_writelane_b32 v0, s1, 1
; GCN-NEXT: v_writelane_b32 v6, s0, 0
; GCN-NEXT: v_writelane_b32 v6, s1, 1
; GCN-NEXT: s_lshr_b32 s0, s4, 16
; GCN-NEXT: v_writelane_b32 v0, s0, 2
; GCN-NEXT: v_writelane_b32 v6, s0, 2
; GCN-NEXT: s_lshr_b32 s0, s4, 17
; GCN-NEXT: v_writelane_b32 v0, s0, 3
; GCN-NEXT: v_writelane_b32 v6, s0, 3
; GCN-NEXT: s_lshr_b32 s0, s4, 18
; GCN-NEXT: v_writelane_b32 v0, s0, 4
; GCN-NEXT: v_writelane_b32 v6, s0, 4
; GCN-NEXT: s_lshr_b32 s0, s4, 19
; GCN-NEXT: v_writelane_b32 v0, s0, 5
; GCN-NEXT: v_writelane_b32 v6, s0, 5
; GCN-NEXT: s_lshr_b32 s0, s4, 20
; GCN-NEXT: v_writelane_b32 v0, s0, 6
; GCN-NEXT: v_writelane_b32 v6, s0, 6
; GCN-NEXT: s_lshr_b32 s0, s4, 21
; GCN-NEXT: v_writelane_b32 v0, s0, 7
; GCN-NEXT: v_writelane_b32 v6, s0, 7
; GCN-NEXT: s_lshr_b32 s0, s4, 22
; GCN-NEXT: v_writelane_b32 v0, s0, 8
; GCN-NEXT: v_writelane_b32 v6, s0, 8
; GCN-NEXT: s_lshr_b32 s0, s4, 23
; GCN-NEXT: v_writelane_b32 v0, s0, 9
; GCN-NEXT: v_writelane_b32 v6, s0, 9
; GCN-NEXT: s_lshr_b32 s0, s4, 24
; GCN-NEXT: v_writelane_b32 v0, s0, 10
; GCN-NEXT: v_writelane_b32 v6, s0, 10
; GCN-NEXT: s_lshr_b32 s0, s4, 25
; GCN-NEXT: v_writelane_b32 v0, s0, 11
; GCN-NEXT: v_writelane_b32 v6, s0, 11
; GCN-NEXT: s_lshr_b32 s0, s4, 26
; GCN-NEXT: v_writelane_b32 v0, s0, 12
; GCN-NEXT: v_writelane_b32 v6, s0, 12
; GCN-NEXT: s_lshr_b32 s0, s4, 27
; GCN-NEXT: v_writelane_b32 v0, s0, 13
; GCN-NEXT: v_writelane_b32 v6, s0, 13
; GCN-NEXT: s_lshr_b32 s0, s4, 28
; GCN-NEXT: v_writelane_b32 v0, s0, 14
; GCN-NEXT: v_writelane_b32 v6, s0, 14
; GCN-NEXT: s_lshr_b32 s0, s4, 29
; GCN-NEXT: v_writelane_b32 v0, s0, 15
; GCN-NEXT: v_writelane_b32 v6, s0, 15
; GCN-NEXT: s_lshr_b32 s0, s4, 30
; GCN-NEXT: v_writelane_b32 v0, s0, 16
; GCN-NEXT: v_writelane_b32 v6, s0, 16
; GCN-NEXT: s_lshr_b32 s0, s4, 31
; GCN-NEXT: v_writelane_b32 v0, s0, 17
; GCN-NEXT: v_writelane_b32 v0, s9, 18
; GCN-NEXT: v_writelane_b32 v6, s0, 17
; GCN-NEXT: v_writelane_b32 v6, s9, 18
; GCN-NEXT: s_bfe_u32 s9, s4, 0xe0002
; GCN-NEXT: v_writelane_b32 v0, s9, 19
; GCN-NEXT: v_writelane_b32 v6, s9, 19
; GCN-NEXT: s_bfe_u32 s9, s4, 0xd0003
; GCN-NEXT: v_writelane_b32 v0, s9, 20
; GCN-NEXT: v_writelane_b32 v6, s9, 20
; GCN-NEXT: s_bfe_u32 s9, s4, 0xc0004
; GCN-NEXT: v_writelane_b32 v0, s9, 21
; GCN-NEXT: v_writelane_b32 v6, s9, 21
; GCN-NEXT: s_bfe_u32 s9, s4, 0xb0005
; GCN-NEXT: v_writelane_b32 v0, s9, 22
; GCN-NEXT: v_writelane_b32 v6, s9, 22
; GCN-NEXT: s_bfe_u32 s9, s4, 0xa0006
; GCN-NEXT: v_writelane_b32 v0, s9, 23
; GCN-NEXT: v_writelane_b32 v6, s9, 23
; GCN-NEXT: s_bfe_u32 s9, s4, 0x90007
; GCN-NEXT: v_writelane_b32 v0, s9, 24
; GCN-NEXT: v_writelane_b32 v6, s9, 24
; GCN-NEXT: s_bfe_u32 s9, s4, 0x80008
; GCN-NEXT: v_writelane_b32 v0, s9, 25
; GCN-NEXT: v_writelane_b32 v6, s9, 25
; GCN-NEXT: s_bfe_u32 s9, s4, 0x70009
; GCN-NEXT: v_writelane_b32 v0, s9, 26
; GCN-NEXT: v_writelane_b32 v6, s9, 26
; GCN-NEXT: s_bfe_u32 s9, s4, 0x6000a
; GCN-NEXT: v_writelane_b32 v0, s9, 27
; GCN-NEXT: v_writelane_b32 v6, s9, 27
; GCN-NEXT: s_bfe_u32 s9, s4, 0x5000b
; GCN-NEXT: v_writelane_b32 v0, s9, 28
; GCN-NEXT: v_writelane_b32 v6, s9, 28
; GCN-NEXT: s_bfe_u32 s9, s4, 0x4000c
; GCN-NEXT: v_writelane_b32 v0, s9, 29
; GCN-NEXT: v_writelane_b32 v6, s9, 29
; GCN-NEXT: s_bfe_u32 s9, s4, 0x3000d
; GCN-NEXT: v_writelane_b32 v0, s9, 30
; GCN-NEXT: v_writelane_b32 v6, s9, 30
; GCN-NEXT: s_bfe_u32 s9, s4, 0x2000e
; GCN-NEXT: v_writelane_b32 v0, s9, 31
; GCN-NEXT: v_writelane_b32 v6, s9, 31
; GCN-NEXT: s_bfe_u32 s9, s4, 0x1000f
; GCN-NEXT: v_writelane_b32 v0, s9, 32
; GCN-NEXT: v_writelane_b32 v6, s9, 32
; GCN-NEXT: s_bfe_u32 s9, s5, 0xf0001
; GCN-NEXT: s_lshr_b32 s43, s5, 17
; GCN-NEXT: s_lshr_b32 s45, s5, 18
@@ -1125,7 +1125,7 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_lshr_b32 s2, s7, 29
; GCN-NEXT: s_lshr_b32 s1, s7, 30
; GCN-NEXT: s_lshr_b32 s0, s7, 31
; GCN-NEXT: v_writelane_b32 v0, s9, 33
; GCN-NEXT: v_writelane_b32 v6, s9, 33
; GCN-NEXT: s_bfe_u32 s40, s5, 0xe0002
; GCN-NEXT: s_bfe_u32 s41, s5, 0xd0003
; GCN-NEXT: s_bfe_u32 s44, s5, 0xc0004
@@ -1630,7 +1630,7 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_cmp_lg_u32 s8, 33
; GCN-NEXT: v_readlane_b32 s9, v0, 33
; GCN-NEXT: v_readlane_b32 s9, v6, 33
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_lshl_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s5, s5, s9
@@ -1643,43 +1643,43 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_and_b32 s1, s1, 0xffff
; GCN-NEXT: s_or_b32 s0, s1, s0
; GCN-NEXT: s_cmp_lg_u32 s8, 31
; GCN-NEXT: v_readlane_b32 s1, v0, 17
; GCN-NEXT: v_readlane_b32 s1, v6, 17
; GCN-NEXT: s_cselect_b32 s1, s1, 1
; GCN-NEXT: s_lshl_b32 s1, s1, 3
; GCN-NEXT: s_cmp_lg_u32 s8, 30
; GCN-NEXT: v_readlane_b32 s2, v0, 16
; GCN-NEXT: v_readlane_b32 s2, v6, 16
; GCN-NEXT: s_cselect_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_lshl_b32 s2, s2, 2
; GCN-NEXT: s_or_b32 s1, s1, s2
; GCN-NEXT: s_cmp_lg_u32 s8, 29
; GCN-NEXT: v_readlane_b32 s2, v0, 15
; GCN-NEXT: v_readlane_b32 s2, v6, 15
; GCN-NEXT: s_cselect_b32 s2, s2, 1
; GCN-NEXT: s_lshl_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s8, 28
; GCN-NEXT: v_readlane_b32 s3, v0, 14
; GCN-NEXT: v_readlane_b32 s3, v6, 14
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_and_b32 s3, s3, 1
; GCN-NEXT: s_or_b32 s2, s3, s2
; GCN-NEXT: s_and_b32 s2, s2, 3
; GCN-NEXT: s_or_b32 s1, s2, s1
; GCN-NEXT: s_lshl_b32 s1, s1, 12
; GCN-NEXT: s_cmp_lg_u32 s8, 27
; GCN-NEXT: v_readlane_b32 s2, v0, 13
; GCN-NEXT: v_readlane_b32 s2, v6, 13
; GCN-NEXT: s_cselect_b32 s2, s2, 1
; GCN-NEXT: s_lshl_b32 s2, s2, 3
; GCN-NEXT: s_cmp_lg_u32 s8, 26
; GCN-NEXT: v_readlane_b32 s3, v0, 12
; GCN-NEXT: v_readlane_b32 s3, v6, 12
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_and_b32 s3, s3, 1
; GCN-NEXT: s_lshl_b32 s3, s3, 2
; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_cmp_lg_u32 s8, 25
; GCN-NEXT: v_readlane_b32 s3, v0, 11
; GCN-NEXT: v_readlane_b32 s3, v6, 11
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_lshl_b32 s3, s3, 1
; GCN-NEXT: s_cmp_lg_u32 s8, 24
; GCN-NEXT: v_readlane_b32 s5, v0, 10
; GCN-NEXT: v_readlane_b32 s5, v6, 10
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_or_b32 s3, s5, s3
@@ -1689,43 +1689,43 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_lshl_b32 s2, s2, 8
; GCN-NEXT: s_or_b32 s1, s1, s2
; GCN-NEXT: s_cmp_lg_u32 s8, 23
; GCN-NEXT: v_readlane_b32 s2, v0, 9
; GCN-NEXT: v_readlane_b32 s2, v6, 9
; GCN-NEXT: s_cselect_b32 s2, s2, 1
; GCN-NEXT: s_lshl_b32 s2, s2, 3
; GCN-NEXT: s_cmp_lg_u32 s8, 22
; GCN-NEXT: v_readlane_b32 s3, v0, 8
; GCN-NEXT: v_readlane_b32 s3, v6, 8
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_and_b32 s3, s3, 1
; GCN-NEXT: s_lshl_b32 s3, s3, 2
; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_cmp_lg_u32 s8, 21
; GCN-NEXT: v_readlane_b32 s3, v0, 7
; GCN-NEXT: v_readlane_b32 s3, v6, 7
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_lshl_b32 s3, s3, 1
; GCN-NEXT: s_cmp_lg_u32 s8, 20
; GCN-NEXT: v_readlane_b32 s5, v0, 6
; GCN-NEXT: v_readlane_b32 s5, v6, 6
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_or_b32 s3, s5, s3
; GCN-NEXT: s_and_b32 s3, s3, 3
; GCN-NEXT: s_or_b32 s2, s3, s2
; GCN-NEXT: s_lshl_b32 s2, s2, 4
; GCN-NEXT: s_cmp_lg_u32 s8, 19
; GCN-NEXT: v_readlane_b32 s3, v0, 5
; GCN-NEXT: v_readlane_b32 s3, v6, 5
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_lshl_b32 s3, s3, 3
; GCN-NEXT: s_cmp_lg_u32 s8, 18
; GCN-NEXT: v_readlane_b32 s5, v0, 4
; GCN-NEXT: v_readlane_b32 s5, v6, 4
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 2
; GCN-NEXT: s_or_b32 s3, s3, s5
; GCN-NEXT: s_cmp_lg_u32 s8, 17
; GCN-NEXT: v_readlane_b32 s5, v0, 3
; GCN-NEXT: v_readlane_b32 s5, v6, 3
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 1
; GCN-NEXT: s_cmp_lg_u32 s8, 16
; GCN-NEXT: v_readlane_b32 s9, v0, 2
; GCN-NEXT: v_readlane_b32 s9, v6, 2
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s5, s9, s5
@@ -1737,43 +1737,43 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_or_b32 s1, s2, s1
; GCN-NEXT: s_lshl_b32 s1, s1, 16
; GCN-NEXT: s_cmp_lg_u32 s8, 15
; GCN-NEXT: v_readlane_b32 s2, v0, 32
; GCN-NEXT: v_readlane_b32 s2, v6, 32
; GCN-NEXT: s_cselect_b32 s2, s2, 1
; GCN-NEXT: s_lshl_b32 s2, s2, 3
; GCN-NEXT: s_cmp_lg_u32 s8, 14
; GCN-NEXT: v_readlane_b32 s3, v0, 31
; GCN-NEXT: v_readlane_b32 s3, v6, 31
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_and_b32 s3, s3, 1
; GCN-NEXT: s_lshl_b32 s3, s3, 2
; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_cmp_lg_u32 s8, 13
; GCN-NEXT: v_readlane_b32 s3, v0, 30
; GCN-NEXT: v_readlane_b32 s3, v6, 30
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_lshl_b32 s3, s3, 1
; GCN-NEXT: s_cmp_lg_u32 s8, 12
; GCN-NEXT: v_readlane_b32 s5, v0, 29
; GCN-NEXT: v_readlane_b32 s5, v6, 29
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_or_b32 s3, s5, s3
; GCN-NEXT: s_and_b32 s3, s3, 3
; GCN-NEXT: s_or_b32 s2, s3, s2
; GCN-NEXT: s_lshl_b32 s2, s2, 12
; GCN-NEXT: s_cmp_lg_u32 s8, 11
; GCN-NEXT: v_readlane_b32 s3, v0, 28
; GCN-NEXT: v_readlane_b32 s3, v6, 28
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_lshl_b32 s3, s3, 3
; GCN-NEXT: s_cmp_lg_u32 s8, 10
; GCN-NEXT: v_readlane_b32 s5, v0, 27
; GCN-NEXT: v_readlane_b32 s5, v6, 27
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 2
; GCN-NEXT: s_or_b32 s3, s3, s5
; GCN-NEXT: s_cmp_lg_u32 s8, 9
; GCN-NEXT: v_readlane_b32 s5, v0, 26
; GCN-NEXT: v_readlane_b32 s5, v6, 26
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 1
; GCN-NEXT: s_cmp_lg_u32 s8, 8
; GCN-NEXT: v_readlane_b32 s9, v0, 25
; GCN-NEXT: v_readlane_b32 s9, v6, 25
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s5, s9, s5
@@ -1783,33 +1783,33 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_lshl_b32 s3, s3, 8
; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_cmp_lg_u32 s8, 7
; GCN-NEXT: v_readlane_b32 s3, v0, 24
; GCN-NEXT: v_readlane_b32 s3, v6, 24
; GCN-NEXT: s_cselect_b32 s3, s3, 1
; GCN-NEXT: s_lshl_b32 s3, s3, 3
; GCN-NEXT: s_cmp_lg_u32 s8, 6
; GCN-NEXT: v_readlane_b32 s5, v0, 23
; GCN-NEXT: v_readlane_b32 s5, v6, 23
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_and_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 2
; GCN-NEXT: s_or_b32 s3, s3, s5
; GCN-NEXT: s_cmp_lg_u32 s8, 5
; GCN-NEXT: v_readlane_b32 s5, v0, 22
; GCN-NEXT: v_readlane_b32 s5, v6, 22
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 1
; GCN-NEXT: s_cmp_lg_u32 s8, 4
; GCN-NEXT: v_readlane_b32 s9, v0, 21
; GCN-NEXT: v_readlane_b32 s9, v6, 21
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_or_b32 s5, s9, s5
; GCN-NEXT: s_and_b32 s5, s5, 3
; GCN-NEXT: s_or_b32 s3, s5, s3
; GCN-NEXT: s_lshl_b32 s3, s3, 4
; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_readlane_b32 s5, v0, 20
; GCN-NEXT: v_readlane_b32 s5, v6, 20
; GCN-NEXT: s_cselect_b32 s5, s5, 1
; GCN-NEXT: s_lshl_b32 s5, s5, 3
; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_readlane_b32 s9, v0, 19
; GCN-NEXT: v_readlane_b32 s9, v6, 19
; GCN-NEXT: s_cselect_b32 s9, s9, 1
; GCN-NEXT: s_and_b32 s9, s9, 1
; GCN-NEXT: s_lshl_b32 s9, s9, 2
@@ -1818,7 +1818,7 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_cselect_b32 s4, s4, 1
; GCN-NEXT: s_and_b32 s4, s4, 1
; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_readlane_b32 s8, v0, 18
; GCN-NEXT: v_readlane_b32 s8, v6, 18
; GCN-NEXT: s_cselect_b32 s8, s8, 1
; GCN-NEXT: s_lshl_b32 s8, s8, 1
; GCN-NEXT: s_or_b32 s4, s4, s8
@@ -1830,16 +1830,15 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
; GCN-NEXT: s_or_b32 s2, s3, s2
; GCN-NEXT: s_and_b32 s2, s2, 0xffff
; GCN-NEXT: s_or_b32 s1, s2, s1
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_readlane_b32 s0, v0, 0
; GCN-NEXT: v_readlane_b32 s1, v0, 1
; GCN-NEXT: v_mov_b32_e32 v6, s1
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v4, s7
; GCN-NEXT: v_mov_b32_e32 v5, s0
; GCN-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
; GCN-NEXT: ; kill: killed $vgpr0
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_readlane_b32 s0, v6, 0
; GCN-NEXT: v_readlane_b32 s1, v6, 1
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <128 x i1> %vec, i1 1, i32 %sel
37 changes: 7 additions & 30 deletions llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -13,22 +13,14 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; CHECK-NEXT: s_mov_b64 s[10:11], s[8:9]
; CHECK-NEXT: v_mov_b32_e32 v3, v2
; CHECK-NEXT: v_mov_b32_e32 v2, v1
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s8, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_load_dword s8, s[6:7], 0x0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v0, s8, 0
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s8, s33, 0x100200
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: ; implicit-def: $vgpr40 : SGPR spill to VGPR lane
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v40, s8, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def vgpr10
; CHECK-NEXT: ;;#ASMEND
@@ -62,39 +54,24 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s4, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_add_i32 s4, s33, 0x100100
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(1)
; CHECK-NEXT: v_readlane_b32 s4, v0, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 0
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: s_cmp_eq_u32 s4, s5
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %store
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s4, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: s_add_i32 s4, s33, 0x100000
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ds_write_b32 v1, v2
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: ds_write_b32 v0, v1
; CHECK-NEXT: s_endpgm
; CHECK-NEXT: .LBB0_2: ; %end
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
; CHECK-NEXT: s_add_i32 s4, s33, 0x100200
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[34:35]
; CHECK-NEXT: ; kill: killed $vgpr0
; CHECK-NEXT: s_endpgm
%arr = alloca < 1339 x i32>, align 8192, addrspace(5)
%cmp = icmp ne i32 %val, 0
22 changes: 22 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -123,6 +123,8 @@
; GCN-O0-NEXT: SI Pre-allocate WWM Registers
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI Lower WWM Copies
; GCN-O0-NEXT: AMDGPU Reserve WWM Registers
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI Fix VGPR copies
; GCN-O0-NEXT: Remove Redundant DEBUG_VALUE analysis
; GCN-O0-NEXT: Fixup Statepoint Caller Saved
@@ -370,6 +372,11 @@
; GCN-O1-NEXT: SI Pre-allocate WWM Registers
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: SI Lower WWM Copies
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: AMDGPU Reserve WWM Registers
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: GCN NSA Reassign
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: AMDGPU Mark Last Scratch Load
@@ -673,6 +680,11 @@
; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: AMDGPU Reserve WWM Registers
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: AMDGPU Mark Last Scratch Load
@@ -982,6 +994,11 @@
; GCN-O2-NEXT: SI Pre-allocate WWM Registers
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: SI Lower WWM Copies
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: AMDGPU Reserve WWM Registers
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: GCN NSA Reassign
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: AMDGPU Mark Last Scratch Load
@@ -1303,6 +1320,11 @@
; GCN-O3-NEXT: SI Pre-allocate WWM Registers
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: SI Lower WWM Copies
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: AMDGPU Reserve WWM Registers
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: GCN NSA Reassign
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: AMDGPU Mark Last Scratch Load
75 changes: 37 additions & 38 deletions llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -8759,11 +8759,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
; GFX8-NEXT: s_add_u32 s88, s88, s9
; GFX8-NEXT: s_addc_u32 s89, s89, 0
; GFX8-NEXT: ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s0, s3, 8
; GFX8-NEXT: v_writelane_b32 v44, s0, 0
; GFX8-NEXT: v_writelane_b32 v44, s1, 1
; GFX8-NEXT: v_writelane_b32 v62, s0, 0
; GFX8-NEXT: v_writelane_b32 v62, s1, 1
; GFX8-NEXT: s_lshr_b32 s0, s2, 1
; GFX8-NEXT: s_lshr_b32 s36, s3, 21
; GFX8-NEXT: s_lshr_b32 s30, s3, 19
@@ -8789,7 +8789,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_lshr_b32 s54, s3, 10
; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
; GFX8-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
; GFX8-NEXT: v_writelane_b32 v44, s0, 2
; GFX8-NEXT: v_writelane_b32 v62, s0, 2
; GFX8-NEXT: s_lshr_b32 s52, s3, 11
; GFX8-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000
; GFX8-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000
@@ -8814,7 +8814,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_i64 s[30:31], s[44:45], 0x10000
; GFX8-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x10000
; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX8-NEXT: v_writelane_b32 v44, s1, 3
; GFX8-NEXT: v_writelane_b32 v62, s1, 3
; GFX8-NEXT: s_lshr_b32 s6, s3, 9
; GFX8-NEXT: s_lshr_b32 s8, s3, 6
; GFX8-NEXT: s_lshr_b32 s10, s3, 7
@@ -8830,7 +8830,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v4, s74
; GFX8-NEXT: v_mov_b32_e32 v8, s72
; GFX8-NEXT: v_mov_b32_e32 v0, s70
; GFX8-NEXT: v_mov_b32_e32 v55, s68
; GFX8-NEXT: v_mov_b32_e32 v54, s68
; GFX8-NEXT: v_mov_b32_e32 v20, s66
; GFX8-NEXT: v_mov_b32_e32 v16, s64
; GFX8-NEXT: v_mov_b32_e32 v24, s62
@@ -8851,7 +8851,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s46
; GFX8-NEXT: s_lshr_b32 s70, s2, 21
; GFX8-NEXT: s_lshr_b32 s68, s2, 18
; GFX8-NEXT: v_mov_b32_e32 v57, s42
; GFX8-NEXT: v_mov_b32_e32 v56, s42
; GFX8-NEXT: s_lshr_b32 s66, s2, 19
; GFX8-NEXT: s_lshr_b32 s64, s2, 16
; GFX8-NEXT: v_mov_b32_e32 v22, s40
@@ -8876,16 +8876,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_lshr_b32 s36, s2, 2
; GFX8-NEXT: s_lshr_b32 s30, s2, 3
; GFX8-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x10000
; GFX8-NEXT: v_readlane_b32 s2, v44, 0
; GFX8-NEXT: v_readlane_b32 s3, v44, 1
; GFX8-NEXT: v_readlane_b32 s2, v62, 0
; GFX8-NEXT: v_readlane_b32 s3, v62, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s75
; GFX8-NEXT: v_mov_b32_e32 v7, s51
; GFX8-NEXT: v_mov_b32_e32 v9, s73
; GFX8-NEXT: v_mov_b32_e32 v11, s49
; GFX8-NEXT: v_mov_b32_e32 v1, s71
; GFX8-NEXT: v_mov_b32_e32 v3, s47
; GFX8-NEXT: v_mov_b32_e32 v56, s69
; GFX8-NEXT: v_mov_b32_e32 v58, s43
; GFX8-NEXT: v_mov_b32_e32 v55, s69
; GFX8-NEXT: v_mov_b32_e32 v57, s43
; GFX8-NEXT: v_mov_b32_e32 v21, s67
; GFX8-NEXT: v_mov_b32_e32 v23, s41
; GFX8-NEXT: v_mov_b32_e32 v17, s65
@@ -8942,24 +8942,24 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v42, s2
; GFX8-NEXT: s_add_u32 s2, s4, 0x1e0
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v46, s3
; GFX8-NEXT: v_mov_b32_e32 v45, s2
; GFX8-NEXT: v_mov_b32_e32 v45, s3
; GFX8-NEXT: v_mov_b32_e32 v44, s2
; GFX8-NEXT: s_add_u32 s2, s4, 0x1d0
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v48, s3
; GFX8-NEXT: v_mov_b32_e32 v47, s2
; GFX8-NEXT: v_mov_b32_e32 v47, s3
; GFX8-NEXT: v_mov_b32_e32 v46, s2
; GFX8-NEXT: s_add_u32 s2, s4, 0x1c0
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v50, s3
; GFX8-NEXT: v_mov_b32_e32 v49, s2
; GFX8-NEXT: v_mov_b32_e32 v49, s3
; GFX8-NEXT: v_mov_b32_e32 v48, s2
; GFX8-NEXT: s_add_u32 s2, s4, 0x1b0
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v52, s3
; GFX8-NEXT: v_mov_b32_e32 v51, s2
; GFX8-NEXT: v_mov_b32_e32 v51, s3
; GFX8-NEXT: v_mov_b32_e32 v50, s2
; GFX8-NEXT: s_add_u32 s2, s4, 0x1a0
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v54, s3
; GFX8-NEXT: v_mov_b32_e32 v53, s2
; GFX8-NEXT: v_mov_b32_e32 v53, s3
; GFX8-NEXT: v_mov_b32_e32 v52, s2
; GFX8-NEXT: s_add_u32 s2, s4, 0x190
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v15, s3
@@ -8971,26 +8971,26 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: buffer_store_dword v12, off, s[88:91], 0 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v13, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
; GFX8-NEXT: flat_store_dwordx4 v[45:46], v[8:11]
; GFX8-NEXT: flat_store_dwordx4 v[47:48], v[0:3]
; GFX8-NEXT: flat_store_dwordx4 v[49:50], v[55:58]
; GFX8-NEXT: flat_store_dwordx4 v[51:52], v[20:23]
; GFX8-NEXT: flat_store_dwordx4 v[53:54], v[16:19]
; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[8:11]
; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[54:57]
; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[20:23]
; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[16:19]
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[24:27]
; GFX8-NEXT: buffer_load_dword v18, off, s[88:91], 0 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v19, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
; GFX8-NEXT: s_add_u32 s2, s4, 0x170
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v60, s3
; GFX8-NEXT: v_mov_b32_e32 v59, s2
; GFX8-NEXT: v_mov_b32_e32 v59, s3
; GFX8-NEXT: v_mov_b32_e32 v58, s2
; GFX8-NEXT: s_add_u32 s2, s4, 0x160
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v62, s3
; GFX8-NEXT: v_mov_b32_e32 v61, s2
; GFX8-NEXT: v_mov_b32_e32 v61, s3
; GFX8-NEXT: v_mov_b32_e32 v60, s2
; GFX8-NEXT: s_add_u32 s2, s4, 0x150
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v46, s3
; GFX8-NEXT: v_mov_b32_e32 v45, s2
; GFX8-NEXT: v_mov_b32_e32 v45, s3
; GFX8-NEXT: v_mov_b32_e32 v44, s2
; GFX8-NEXT: s_add_u32 s2, s4, 0x140
; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v6, s0
@@ -9021,9 +9021,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v11, s15
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[28:31]
; GFX8-NEXT: flat_store_dwordx4 v[59:60], v[32:35]
; GFX8-NEXT: flat_store_dwordx4 v[61:62], v[36:39]
; GFX8-NEXT: flat_store_dwordx4 v[45:46], v[40:43]
; GFX8-NEXT: flat_store_dwordx4 v[58:59], v[32:35]
; GFX8-NEXT: flat_store_dwordx4 v[60:61], v[36:39]
; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[40:43]
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
@@ -9177,17 +9177,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s30
; GFX8-NEXT: v_mov_b32_e32 v3, s31
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_readlane_b32 s0, v44, 2
; GFX8-NEXT: v_readlane_b32 s0, v62, 2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_readlane_b32 s1, v44, 3
; GFX8-NEXT: v_readlane_b32 s1, v62, 3
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s18
; GFX8-NEXT: v_mov_b32_e32 v1, s19
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: ; kill: killed $vgpr44
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: constant_sextload_v64i1_to_v64i64:
160 changes: 101 additions & 59 deletions llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -1520,9 +1520,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: s_add_i32 s6, s32, 0x201000
; GFX7-NEXT: s_add_i32 s6, s32, 0x202000
; GFX7-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX7-NEXT: s_add_i32 s6, s32, 0x201200
; GFX7-NEXT: s_add_i32 s6, s32, 0x202100
; GFX7-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: v_writelane_b32 v23, s28, 28
@@ -1562,36 +1562,57 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: ; implicit-def: $vgpr22
; GFX7-NEXT: v_writelane_b32 v23, s59, 27
; GFX7-NEXT: buffer_store_dword v16, off, s[0:3], s32
; GFX7-NEXT: v_mov_b32_e32 v16, 0x8040
; GFX7-NEXT: buffer_store_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
; GFX7-NEXT: v_writelane_b32 v22, vcc_lo, 0
; GFX7-NEXT: v_writelane_b32 v22, vcc_hi, 1
; GFX7-NEXT: s_or_saveexec_b64 s[28:29], -1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX7-NEXT: v_mov_b32_e32 v0, 0x8044
; GFX7-NEXT: buffer_store_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Spill
; GFX7-NEXT: s_mov_b64 exec, s[28:29]
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX7-NEXT: v_lshr_b32_e64 v22, s32, 6
; GFX7-NEXT: s_movk_i32 vcc_lo, 0x4040
; GFX7-NEXT: v_add_i32_e32 v22, vcc, vcc_lo, v22
; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0x200, v22
; GFX7-NEXT: v_readfirstlane_b32 s59, v22
; GFX7-NEXT: v_add_i32_e32 v0, vcc, vcc_lo, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x200, v0
; GFX7-NEXT: v_writelane_b32 v23, s59, 27
; GFX7-NEXT: v_readfirstlane_b32 s59, v0
; GFX7-NEXT: s_and_b64 vcc, 0, exec
; GFX7-NEXT: s_mov_b64 s[28:29], exec
; GFX7-NEXT: s_mov_b64 exec, -1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX7-NEXT: v_mov_b32_e32 v0, 0x8044
; GFX7-NEXT: buffer_load_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[28:29]
; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_readlane_b32 vcc_lo, v22, 0
; GFX7-NEXT: v_readlane_b32 vcc_hi, v22, 1
; GFX7-NEXT: s_mov_b64 s[28:29], exec
; GFX7-NEXT: s_mov_b64 exec, -1
; GFX7-NEXT: s_mov_b64 exec, s[28:29]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v16, off, s[0:3], s32
; GFX7-NEXT: v_mov_b32_e32 v16, 0x8040
; GFX7-NEXT: buffer_load_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Reload
; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
@@ -1624,13 +1645,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: v_readlane_b32 s33, v23, 2
; GFX7-NEXT: v_readlane_b32 s31, v23, 1
; GFX7-NEXT: v_readlane_b32 s30, v23, 0
; GFX7-NEXT: ; kill: killed $vgpr22
; GFX7-NEXT: v_readlane_b32 s28, v23, 28
; GFX7-NEXT: v_readlane_b32 s29, v23, 29
; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX7-NEXT: s_add_i32 s6, s32, 0x201000
; GFX7-NEXT: s_add_i32 s6, s32, 0x202000
; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX7-NEXT: s_add_i32 s6, s32, 0x201200
; GFX7-NEXT: s_add_i32 s6, s32, 0x202100
; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1640,9 +1660,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
; GFX8-NEXT: s_add_i32 s6, s32, 0x202000
; GFX8-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_add_i32 s6, s32, 0x201200
; GFX8-NEXT: s_add_i32 s6, s32, 0x202100
; GFX8-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: v_writelane_b32 v23, s58, 28
@@ -1682,36 +1702,60 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: ; implicit-def: $vgpr22
; GFX8-NEXT: v_writelane_b32 v23, s59, 27
; GFX8-NEXT: buffer_store_dword v16, off, s[0:3], s32
; GFX8-NEXT: v_mov_b32_e32 v16, 0x8040
; GFX8-NEXT: buffer_store_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Spill
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; GFX8-NEXT: v_writelane_b32 v22, vcc_lo, 0
; GFX8-NEXT: v_writelane_b32 v22, vcc_hi, 1
; GFX8-NEXT: s_or_saveexec_b64 s[58:59], -1
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX8-NEXT: v_mov_b32_e32 v0, 0x8044
; GFX8-NEXT: buffer_store_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[58:59]
; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX8-NEXT: v_lshrrev_b32_e64 v22, 6, s32
; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040
; GFX8-NEXT: v_add_u32_e32 v22, vcc, vcc_lo, v22
; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0x200, v22
; GFX8-NEXT: v_readfirstlane_b32 s59, v22
; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x200, v0
; GFX8-NEXT: v_writelane_b32 v23, s59, 27
; GFX8-NEXT: v_readfirstlane_b32 s59, v0
; GFX8-NEXT: s_and_b64 vcc, 0, exec
; GFX8-NEXT: s_mov_b64 s[58:59], exec
; GFX8-NEXT: s_mov_b64 exec, -1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], s32
; GFX8-NEXT: v_mov_b32_e32 v0, 0x8044
; GFX8-NEXT: buffer_load_dword v22, v0, s[0:3], s32 offen ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[58:59]
; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_readlane_b32 vcc_lo, v22, 0
; GFX8-NEXT: v_readlane_b32 vcc_hi, v22, 1
; GFX8-NEXT: s_mov_b64 s[58:59], exec
; GFX8-NEXT: s_mov_b64 exec, -1
; GFX8-NEXT: s_mov_b64 exec, s[58:59]
; GFX8-NEXT: v_readlane_b32 s58, v23, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_store_dword v16, off, s[0:3], s32
; GFX8-NEXT: v_mov_b32_e32 v16, 0x8040
; GFX8-NEXT: buffer_load_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Reload
; GFX8-NEXT: buffer_load_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Reload
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc
@@ -1744,13 +1788,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: v_readlane_b32 s33, v23, 2
; GFX8-NEXT: v_readlane_b32 s31, v23, 1
; GFX8-NEXT: v_readlane_b32 s30, v23, 0
; GFX8-NEXT: ; kill: killed $vgpr22
; GFX8-NEXT: v_readlane_b32 s58, v23, 28
; GFX8-NEXT: v_readlane_b32 s59, v23, 29
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
; GFX8-NEXT: s_add_i32 s6, s32, 0x202000
; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX8-NEXT: s_add_i32 s6, s32, 0x201200
; GFX8-NEXT: s_add_i32 s6, s32, 0x202100
; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
637 changes: 307 additions & 330 deletions llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll

Large diffs are not rendered by default.

721 changes: 346 additions & 375 deletions llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll

Large diffs are not rendered by default.

@@ -11,21 +11,17 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %26
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %26
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23
; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %6
; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7
; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
; REGALLOC-GFX908-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: S_ENDPGM 0
;
; PEI-GFX908-LABEL: name: partial_copy
@@ -60,18 +56,15 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %25
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25
; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23
; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %6
; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; REGALLOC-GFX90A-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX90A-NEXT: S_ENDPGM 0
;
1,086 changes: 517 additions & 569 deletions llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll

Large diffs are not rendered by default.

44 changes: 14 additions & 30 deletions llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain-preserve.mir
@@ -61,35 +61,27 @@ machineFunctionInfo:
isChainFunction: true
returnsVoid: true
wwmReservedRegs:
- '$vgpr11'
- '$vgpr10'
body: |
bb.0:
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
; GCN-LABEL: name: preserve_all_lanes_wwm_above_args
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr10, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST killed $vgpr11, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0
; GCN-NEXT: renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: $vgpr8 = COPY killed renamable $vgpr10
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: $vgpr8 = COPY killed $vgpr0
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr10 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr11 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
renamable $vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
$vgpr10 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr10
$sgpr35 = S_MOV_B32 5
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr10, 0
renamable $vgpr10 = V_MOV_B32_e32 10, implicit $exec
$vgpr8 = COPY renamable killed $vgpr10
$vgpr10 = V_MOV_B32_e32 10, implicit $exec
$vgpr8 = COPY killed $vgpr10
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
@@ -139,23 +131,15 @@ body: |
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
; GCN-LABEL: name: preserve_inactive_lanes_wwm_args
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9, $vgpr10
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr10
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr9(tied-def 0) :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr0
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
$sgpr35 = S_MOV_B32 5
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
Expand Down Expand Up @@ -184,7 +168,7 @@ body: |
; GCN-LABEL: name: dont_preserve_if_no_chain_calls
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
Expand Down Expand Up @@ -218,7 +202,7 @@ body: |
; GCN-LABEL: name: dont_preserve_v0_v7
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
32 changes: 8 additions & 24 deletions llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir
@@ -36,19 +36,11 @@ body: |
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-LABEL: name: preserve_inactive_wwm
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN: liveins: $sgpr0, $sgpr35
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr1
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
@@ -72,24 +64,16 @@ body: |
; GCN-LABEL: name: preserve_inactive_detected_wwm
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr8, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_ST $vgpr9, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: renamable $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9
; GCN-NEXT: $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr9, 0
; GCN-NEXT: renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr8(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $vgpr9 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
$sgpr35 = S_MOV_B32 5
@@ -122,7 +106,7 @@ body: |
; GCN-LABEL: name: dont_preserve_wwm_if_no_chain_calls
; GCN: liveins: $sgpr35, $vgpr8
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
@@ -151,11 +135,11 @@ body: |
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN-LABEL: name: dont_preserve_wwm_if_init_whole_wave
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
; GCN: liveins: $sgpr0, $sgpr35
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr1
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
@@ -209,7 +193,7 @@ body: |
; GCN-LABEL: name: dont_preserve_v0_v7
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9
; GCN-NEXT: {{ $}}
; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/pr51516.mir
@@ -1,4 +1,4 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,1 -o - %s | FileCheck -check-prefix=GCN %s
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs -start-before=machine-scheduler -stop-after=virtregrewriter,2 -o - %s | FileCheck -check-prefix=GCN %s

# Check that %3 was not rematerialized before the last store since its operand %1
# is killed by that store.
8 changes: 1 addition & 7 deletions llvm/test/CodeGen/AMDGPU/preserve-only-inactive-lane.mir
@@ -20,16 +20,10 @@ body: |
; GCN-LABEL: name: preserve_scratch_vgpr_inactive_lanes
; GCN: liveins: $sgpr35, $vgpr0, $sgpr30_sgpr31
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GCN-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
; GCN-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5)
; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
; GCN-NEXT: S_SETPC_B64_return killed renamable $sgpr30_sgpr31, implicit $vgpr0
renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
$sgpr35 = S_MOV_B32 5
714 changes: 355 additions & 359 deletions llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs=0 -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -start-before=greedy,1 -stop-after=virtregrewriter,1 %s -o /dev/null 2>&1 | FileCheck -check-prefixes=ERR,VERIFIER %s
# RUN: not llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs=0 -start-before=greedy,1 -stop-after=virtregrewriter,2 %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s
# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -start-before=greedy,1 -stop-after=virtregrewriter,2 %s -o /dev/null 2>&1 | FileCheck -check-prefixes=ERR,VERIFIER %s

# FIXME: We should not produce a verifier error after erroring

2,851 changes: 1,406 additions & 1,445 deletions llvm/test/CodeGen/AMDGPU/rem_i128.ll

Large diffs are not rendered by default.

3,090 changes: 1,548 additions & 1,542 deletions llvm/test/CodeGen/AMDGPU/remat-vop.mir

Large diffs are not rendered by default.

459 changes: 226 additions & 233 deletions llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll

Large diffs are not rendered by default.

30 changes: 26 additions & 4 deletions llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -1,19 +1,19 @@
; REQUIRES: asserts

; RUN: llc -verify-machineinstrs=0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s
; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=greedy -vgpr-regalloc=greedy -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s
; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=greedy -wwm-regalloc=greedy -vgpr-regalloc=greedy -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s

; RUN: llc -verify-machineinstrs=0 -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=O0 %s

; RUN: llc -verify-machineinstrs=0 -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT-BASIC %s
; RUN: llc -verify-machineinstrs=0 -wwm-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT-BASIC %s
; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-DEFAULT %s
; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-BASIC %s
; RUN: llc -verify-machineinstrs=0 -sgpr-regalloc=basic -wwm-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-BASIC %s

; RUN: not --crash llc -verify-machineinstrs=0 -regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s
; RUN: not --crash llc -verify-machineinstrs=0 -regalloc=fast -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s


; REGALLOC: -regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc
; REGALLOC: -regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, and -vgpr-regalloc

; DEFAULT: Greedy Register Allocator
; DEFAULT-NEXT: Virtual Register Rewriter
@@ -23,6 +23,11 @@
; DEFAULT-NEXT: SI Pre-allocate WWM Registers
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: SI Lower WWM Copies
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: AMDGPU Reserve WWM Registers
; DEFAULT-NEXT: Virtual Register Map
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: GCN NSA Reassign
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
@@ -37,6 +42,8 @@
; O0-NEXT: SI Pre-allocate WWM Registers
; O0-NEXT: Fast Register Allocator
; O0-NEXT: SI Lower WWM Copies
; O0-NEXT: AMDGPU Reserve WWM Registers
; O0-NEXT: Fast Register Allocator
; O0-NEXT: SI Fix VGPR copies


@@ -60,6 +67,11 @@
; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter
; BASIC-DEFAULT-NEXT: Greedy Register Allocator
; BASIC-DEFAULT-NEXT: SI Lower WWM Copies
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: AMDGPU Reserve WWM Registers
; BASIC-DEFAULT-NEXT: Virtual Register Map
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: Greedy Register Allocator
; BASIC-DEFAULT-NEXT: GCN NSA Reassign
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
@@ -75,6 +87,11 @@
; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: AMDGPU Reserve WWM Registers
; DEFAULT-BASIC-NEXT: Virtual Register Map
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load
@@ -96,6 +113,11 @@
; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: SI Lower WWM Copies
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: AMDGPU Reserve WWM Registers
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: GCN NSA Reassign
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load
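A minimal sketch of the standalone invocation behind the BASIC-BASIC checks above, assembled from the corresponding RUN line (input.ll is a placeholder for any AMDGPU IR module; the lit test substitutes %s):

  # Prints the pass pipeline to stderr; with the split flags, three separate
  # "Basic Register Allocator" runs appear in order (SGPR, then WWM, then VGPR).
  llc -verify-machineinstrs=0 -sgpr-regalloc=basic -wwm-regalloc=basic -vgpr-regalloc=basic \
      -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null input.ll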
18 changes: 8 additions & 10 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir
@@ -1,3 +1,4 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -passes=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s
@@ -45,28 +46,25 @@ body: |
; SGPR_SPILL: bb.0:
; SGPR_SPILL-NEXT: successors: %bb.1(0x80000000)
; SGPR_SPILL-NEXT: {{ $}}
; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILL-NEXT: renamable $sgpr10 = IMPLICIT_DEF
; SGPR_SPILL-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[V_WRITELANE_B32_]]
; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]]
; SGPR_SPILL-NEXT: DBG_VALUE $noreg, 0
; SGPR_SPILL-NEXT: {{ $}}
; SGPR_SPILL-NEXT: bb.1:
; SGPR_SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[V_WRITELANE_B32_]], 0
; SGPR_SPILL-NEXT: KILL [[V_WRITELANE_B32_]]
; SGPR_SPILL-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR [[DEF]], 0
; SGPR_SPILL-NEXT: S_ENDPGM 0
;
; PEI-LABEL: name: test
; PEI: bb.0:
; PEI-NEXT: successors: %bb.1(0x80000000)
; PEI-NEXT: {{ $}}
; PEI-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; PEI-NEXT: renamable $sgpr10 = IMPLICIT_DEF
; PEI-NEXT: renamable $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, killed $vgpr0
; PEI-NEXT: $vgpr0 = IMPLICIT_DEF
; PEI-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, killed $vgpr0
; PEI-NEXT: {{ $}}
; PEI-NEXT: bb.1:
; PEI-NEXT: liveins: $vgpr0
; PEI-NEXT: {{ $}}
; PEI-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
; PEI-NEXT: KILL killed renamable $vgpr0
; PEI-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR killed $vgpr0, 0
; PEI-NEXT: S_ENDPGM 0
bb.0:
renamable $sgpr10 = IMPLICIT_DEF
Original file line number Diff line number Diff line change
@@ -9,7 +9,6 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
call void asm sideeffect "", "~{v[8:15]}" () #0
call void asm sideeffect "", "~{v[16:19]}"() #0
call void asm sideeffect "", "~{v[20:21]}"() #0
call void asm sideeffect "", "~{v22}"() #0
%val0 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0
%val1 = call <4 x i32> asm sideeffect "; def $0", "=s" () #0
%val2 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
295 changes: 138 additions & 157 deletions llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -9,19 +9,9 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 s0, s0, s13
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
; GCN-NEXT: s_load_dword s4, s[6:7], 0x2
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
@@ -31,201 +21,192 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_writelane_b32 v1, s8, 0
; GCN-NEXT: v_writelane_b32 v1, s9, 1
; GCN-NEXT: v_writelane_b32 v1, s10, 2
; GCN-NEXT: v_writelane_b32 v1, s11, 3
; GCN-NEXT: v_writelane_b32 v1, s12, 4
; GCN-NEXT: v_writelane_b32 v1, s13, 5
; GCN-NEXT: v_writelane_b32 v1, s14, 6
; GCN-NEXT: v_writelane_b32 v1, s15, 7
; GCN-NEXT: v_writelane_b32 v1, s16, 8
; GCN-NEXT: v_writelane_b32 v1, s17, 9
; GCN-NEXT: v_writelane_b32 v1, s18, 10
; GCN-NEXT: v_writelane_b32 v1, s19, 11
; GCN-NEXT: v_writelane_b32 v1, s20, 12
; GCN-NEXT: v_writelane_b32 v1, s21, 13
; GCN-NEXT: v_writelane_b32 v1, s22, 14
; GCN-NEXT: v_writelane_b32 v1, s23, 15
; GCN-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; GCN-NEXT: v_writelane_b32 v22, s8, 0
; GCN-NEXT: v_writelane_b32 v22, s9, 1
; GCN-NEXT: v_writelane_b32 v22, s10, 2
; GCN-NEXT: v_writelane_b32 v22, s11, 3
; GCN-NEXT: v_writelane_b32 v22, s12, 4
; GCN-NEXT: v_writelane_b32 v22, s13, 5
; GCN-NEXT: v_writelane_b32 v22, s14, 6
; GCN-NEXT: v_writelane_b32 v22, s15, 7
; GCN-NEXT: v_writelane_b32 v22, s16, 8
; GCN-NEXT: v_writelane_b32 v22, s17, 9
; GCN-NEXT: v_writelane_b32 v22, s18, 10
; GCN-NEXT: v_writelane_b32 v22, s19, 11
; GCN-NEXT: v_writelane_b32 v22, s20, 12
; GCN-NEXT: v_writelane_b32 v22, s21, 13
; GCN-NEXT: v_writelane_b32 v22, s22, 14
; GCN-NEXT: v_writelane_b32 v22, s23, 15
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v1, s8, 16
; GCN-NEXT: v_writelane_b32 v1, s9, 17
; GCN-NEXT: v_writelane_b32 v1, s10, 18
; GCN-NEXT: v_writelane_b32 v1, s11, 19
; GCN-NEXT: v_writelane_b32 v1, s12, 20
; GCN-NEXT: v_writelane_b32 v1, s13, 21
; GCN-NEXT: v_writelane_b32 v1, s14, 22
; GCN-NEXT: v_writelane_b32 v1, s15, 23
; GCN-NEXT: v_writelane_b32 v1, s16, 24
; GCN-NEXT: v_writelane_b32 v1, s17, 25
; GCN-NEXT: v_writelane_b32 v1, s18, 26
; GCN-NEXT: v_writelane_b32 v1, s19, 27
; GCN-NEXT: v_writelane_b32 v1, s20, 28
; GCN-NEXT: v_writelane_b32 v1, s21, 29
; GCN-NEXT: v_writelane_b32 v1, s22, 30
; GCN-NEXT: v_writelane_b32 v1, s23, 31
; GCN-NEXT: v_writelane_b32 v22, s8, 16
; GCN-NEXT: v_writelane_b32 v22, s9, 17
; GCN-NEXT: v_writelane_b32 v22, s10, 18
; GCN-NEXT: v_writelane_b32 v22, s11, 19
; GCN-NEXT: v_writelane_b32 v22, s12, 20
; GCN-NEXT: v_writelane_b32 v22, s13, 21
; GCN-NEXT: v_writelane_b32 v22, s14, 22
; GCN-NEXT: v_writelane_b32 v22, s15, 23
; GCN-NEXT: v_writelane_b32 v22, s16, 24
; GCN-NEXT: v_writelane_b32 v22, s17, 25
; GCN-NEXT: v_writelane_b32 v22, s18, 26
; GCN-NEXT: v_writelane_b32 v22, s19, 27
; GCN-NEXT: v_writelane_b32 v22, s20, 28
; GCN-NEXT: v_writelane_b32 v22, s21, 29
; GCN-NEXT: v_writelane_b32 v22, s22, 30
; GCN-NEXT: v_writelane_b32 v22, s23, 31
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v1, s8, 32
; GCN-NEXT: v_writelane_b32 v1, s9, 33
; GCN-NEXT: v_writelane_b32 v1, s10, 34
; GCN-NEXT: v_writelane_b32 v1, s11, 35
; GCN-NEXT: v_writelane_b32 v1, s12, 36
; GCN-NEXT: v_writelane_b32 v1, s13, 37
; GCN-NEXT: v_writelane_b32 v1, s14, 38
; GCN-NEXT: v_writelane_b32 v1, s15, 39
; GCN-NEXT: v_writelane_b32 v1, s16, 40
; GCN-NEXT: v_writelane_b32 v1, s17, 41
; GCN-NEXT: v_writelane_b32 v1, s18, 42
; GCN-NEXT: v_writelane_b32 v1, s19, 43
; GCN-NEXT: v_writelane_b32 v1, s20, 44
; GCN-NEXT: v_writelane_b32 v1, s21, 45
; GCN-NEXT: v_writelane_b32 v1, s22, 46
; GCN-NEXT: v_writelane_b32 v1, s23, 47
; GCN-NEXT: v_writelane_b32 v22, s8, 32
; GCN-NEXT: v_writelane_b32 v22, s9, 33
; GCN-NEXT: v_writelane_b32 v22, s10, 34
; GCN-NEXT: v_writelane_b32 v22, s11, 35
; GCN-NEXT: v_writelane_b32 v22, s12, 36
; GCN-NEXT: v_writelane_b32 v22, s13, 37
; GCN-NEXT: v_writelane_b32 v22, s14, 38
; GCN-NEXT: v_writelane_b32 v22, s15, 39
; GCN-NEXT: v_writelane_b32 v22, s16, 40
; GCN-NEXT: v_writelane_b32 v22, s17, 41
; GCN-NEXT: v_writelane_b32 v22, s18, 42
; GCN-NEXT: v_writelane_b32 v22, s19, 43
; GCN-NEXT: v_writelane_b32 v22, s20, 44
; GCN-NEXT: v_writelane_b32 v22, s21, 45
; GCN-NEXT: v_writelane_b32 v22, s22, 46
; GCN-NEXT: v_writelane_b32 v22, s23, 47
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_writelane_b32 v1, s8, 48
; GCN-NEXT: v_writelane_b32 v1, s9, 49
; GCN-NEXT: v_writelane_b32 v1, s10, 50
; GCN-NEXT: v_writelane_b32 v1, s11, 51
; GCN-NEXT: v_writelane_b32 v1, s12, 52
; GCN-NEXT: v_writelane_b32 v1, s13, 53
; GCN-NEXT: v_writelane_b32 v1, s14, 54
; GCN-NEXT: v_writelane_b32 v1, s15, 55
; GCN-NEXT: v_writelane_b32 v1, s16, 56
; GCN-NEXT: v_writelane_b32 v1, s17, 57
; GCN-NEXT: v_writelane_b32 v1, s18, 58
; GCN-NEXT: v_writelane_b32 v1, s19, 59
; GCN-NEXT: v_writelane_b32 v1, s20, 60
; GCN-NEXT: v_writelane_b32 v1, s21, 61
; GCN-NEXT: v_writelane_b32 v1, s22, 62
; GCN-NEXT: v_writelane_b32 v1, s23, 63
; GCN-NEXT: v_writelane_b32 v22, s8, 48
; GCN-NEXT: v_writelane_b32 v22, s9, 49
; GCN-NEXT: v_writelane_b32 v22, s10, 50
; GCN-NEXT: v_writelane_b32 v22, s11, 51
; GCN-NEXT: v_writelane_b32 v22, s12, 52
; GCN-NEXT: v_writelane_b32 v22, s13, 53
; GCN-NEXT: v_writelane_b32 v22, s14, 54
; GCN-NEXT: v_writelane_b32 v22, s15, 55
; GCN-NEXT: v_writelane_b32 v22, s16, 56
; GCN-NEXT: v_writelane_b32 v22, s17, 57
; GCN-NEXT: v_writelane_b32 v22, s18, 58
; GCN-NEXT: v_writelane_b32 v22, s19, 59
; GCN-NEXT: v_writelane_b32 v22, s20, 60
; GCN-NEXT: v_writelane_b32 v22, s21, 61
; GCN-NEXT: v_writelane_b32 v22, s22, 62
; GCN-NEXT: v_writelane_b32 v22, s23, 63
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; def s[6:7]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_writelane_b32 v0, s6, 0
; GCN-NEXT: v_writelane_b32 v0, s7, 1
; GCN-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
; GCN-NEXT: v_writelane_b32 v22, s6, 0
; GCN-NEXT: v_writelane_b32 v22, s7, 1
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v22, off, s[0:3], 0 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s4, s5
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb0
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v1, 0
; GCN-NEXT: v_readlane_b32 s5, v1, 1
; GCN-NEXT: v_readlane_b32 s6, v1, 2
; GCN-NEXT: v_readlane_b32 s7, v1, 3
; GCN-NEXT: v_readlane_b32 s8, v1, 4
; GCN-NEXT: v_readlane_b32 s9, v1, 5
; GCN-NEXT: v_readlane_b32 s10, v1, 6
; GCN-NEXT: v_readlane_b32 s11, v1, 7
; GCN-NEXT: v_readlane_b32 s12, v1, 8
; GCN-NEXT: v_readlane_b32 s13, v1, 9
; GCN-NEXT: v_readlane_b32 s14, v1, 10
; GCN-NEXT: v_readlane_b32 s15, v1, 11
; GCN-NEXT: v_readlane_b32 s16, v1, 12
; GCN-NEXT: v_readlane_b32 s17, v1, 13
; GCN-NEXT: v_readlane_b32 s18, v1, 14
; GCN-NEXT: v_readlane_b32 s19, v1, 15
; GCN-NEXT: v_readlane_b32 s4, v23, 0
; GCN-NEXT: v_readlane_b32 s5, v23, 1
; GCN-NEXT: v_readlane_b32 s6, v23, 2
; GCN-NEXT: v_readlane_b32 s7, v23, 3
; GCN-NEXT: v_readlane_b32 s8, v23, 4
; GCN-NEXT: v_readlane_b32 s9, v23, 5
; GCN-NEXT: v_readlane_b32 s10, v23, 6
; GCN-NEXT: v_readlane_b32 s11, v23, 7
; GCN-NEXT: v_readlane_b32 s12, v23, 8
; GCN-NEXT: v_readlane_b32 s13, v23, 9
; GCN-NEXT: v_readlane_b32 s14, v23, 10
; GCN-NEXT: v_readlane_b32 s15, v23, 11
; GCN-NEXT: v_readlane_b32 s16, v23, 12
; GCN-NEXT: v_readlane_b32 s17, v23, 13
; GCN-NEXT: v_readlane_b32 s18, v23, 14
; GCN-NEXT: v_readlane_b32 s19, v23, 15
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s4, v1, 16
; GCN-NEXT: v_readlane_b32 s5, v1, 17
; GCN-NEXT: v_readlane_b32 s6, v1, 18
; GCN-NEXT: v_readlane_b32 s7, v1, 19
; GCN-NEXT: v_readlane_b32 s8, v1, 20
; GCN-NEXT: v_readlane_b32 s9, v1, 21
; GCN-NEXT: v_readlane_b32 s10, v1, 22
; GCN-NEXT: v_readlane_b32 s11, v1, 23
; GCN-NEXT: v_readlane_b32 s12, v1, 24
; GCN-NEXT: v_readlane_b32 s13, v1, 25
; GCN-NEXT: v_readlane_b32 s14, v1, 26
; GCN-NEXT: v_readlane_b32 s15, v1, 27
; GCN-NEXT: v_readlane_b32 s16, v1, 28
; GCN-NEXT: v_readlane_b32 s17, v1, 29
; GCN-NEXT: v_readlane_b32 s18, v1, 30
; GCN-NEXT: v_readlane_b32 s19, v1, 31
; GCN-NEXT: v_readlane_b32 s4, v23, 16
; GCN-NEXT: v_readlane_b32 s5, v23, 17
; GCN-NEXT: v_readlane_b32 s6, v23, 18
; GCN-NEXT: v_readlane_b32 s7, v23, 19
; GCN-NEXT: v_readlane_b32 s8, v23, 20
; GCN-NEXT: v_readlane_b32 s9, v23, 21
; GCN-NEXT: v_readlane_b32 s10, v23, 22
; GCN-NEXT: v_readlane_b32 s11, v23, 23
; GCN-NEXT: v_readlane_b32 s12, v23, 24
; GCN-NEXT: v_readlane_b32 s13, v23, 25
; GCN-NEXT: v_readlane_b32 s14, v23, 26
; GCN-NEXT: v_readlane_b32 s15, v23, 27
; GCN-NEXT: v_readlane_b32 s16, v23, 28
; GCN-NEXT: v_readlane_b32 s17, v23, 29
; GCN-NEXT: v_readlane_b32 s18, v23, 30
; GCN-NEXT: v_readlane_b32 s19, v23, 31
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s4, v1, 32
; GCN-NEXT: v_readlane_b32 s5, v1, 33
; GCN-NEXT: v_readlane_b32 s6, v1, 34
; GCN-NEXT: v_readlane_b32 s7, v1, 35
; GCN-NEXT: v_readlane_b32 s8, v1, 36
; GCN-NEXT: v_readlane_b32 s9, v1, 37
; GCN-NEXT: v_readlane_b32 s10, v1, 38
; GCN-NEXT: v_readlane_b32 s11, v1, 39
; GCN-NEXT: v_readlane_b32 s12, v1, 40
; GCN-NEXT: v_readlane_b32 s13, v1, 41
; GCN-NEXT: v_readlane_b32 s14, v1, 42
; GCN-NEXT: v_readlane_b32 s15, v1, 43
; GCN-NEXT: v_readlane_b32 s16, v1, 44
; GCN-NEXT: v_readlane_b32 s17, v1, 45
; GCN-NEXT: v_readlane_b32 s18, v1, 46
; GCN-NEXT: v_readlane_b32 s19, v1, 47
; GCN-NEXT: v_readlane_b32 s4, v23, 32
; GCN-NEXT: v_readlane_b32 s5, v23, 33
; GCN-NEXT: v_readlane_b32 s6, v23, 34
; GCN-NEXT: v_readlane_b32 s7, v23, 35
; GCN-NEXT: v_readlane_b32 s8, v23, 36
; GCN-NEXT: v_readlane_b32 s9, v23, 37
; GCN-NEXT: v_readlane_b32 s10, v23, 38
; GCN-NEXT: v_readlane_b32 s11, v23, 39
; GCN-NEXT: v_readlane_b32 s12, v23, 40
; GCN-NEXT: v_readlane_b32 s13, v23, 41
; GCN-NEXT: v_readlane_b32 s14, v23, 42
; GCN-NEXT: v_readlane_b32 s15, v23, 43
; GCN-NEXT: v_readlane_b32 s16, v23, 44
; GCN-NEXT: v_readlane_b32 s17, v23, 45
; GCN-NEXT: v_readlane_b32 s18, v23, 46
; GCN-NEXT: v_readlane_b32 s19, v23, 47
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:19]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s8, v1, 48
; GCN-NEXT: v_readlane_b32 s9, v1, 49
; GCN-NEXT: v_readlane_b32 s10, v1, 50
; GCN-NEXT: v_readlane_b32 s11, v1, 51
; GCN-NEXT: v_readlane_b32 s12, v1, 52
; GCN-NEXT: v_readlane_b32 s13, v1, 53
; GCN-NEXT: v_readlane_b32 s14, v1, 54
; GCN-NEXT: v_readlane_b32 s15, v1, 55
; GCN-NEXT: v_readlane_b32 s16, v1, 56
; GCN-NEXT: v_readlane_b32 s17, v1, 57
; GCN-NEXT: v_readlane_b32 s18, v1, 58
; GCN-NEXT: v_readlane_b32 s19, v1, 59
; GCN-NEXT: v_readlane_b32 s20, v1, 60
; GCN-NEXT: v_readlane_b32 s21, v1, 61
; GCN-NEXT: v_readlane_b32 s22, v1, 62
; GCN-NEXT: v_readlane_b32 s23, v1, 63
; GCN-NEXT: v_readlane_b32 s8, v23, 48
; GCN-NEXT: v_readlane_b32 s9, v23, 49
; GCN-NEXT: v_readlane_b32 s10, v23, 50
; GCN-NEXT: v_readlane_b32 s11, v23, 51
; GCN-NEXT: v_readlane_b32 s12, v23, 52
; GCN-NEXT: v_readlane_b32 s13, v23, 53
; GCN-NEXT: v_readlane_b32 s14, v23, 54
; GCN-NEXT: v_readlane_b32 s15, v23, 55
; GCN-NEXT: v_readlane_b32 s16, v23, 56
; GCN-NEXT: v_readlane_b32 s17, v23, 57
; GCN-NEXT: v_readlane_b32 s18, v23, 58
; GCN-NEXT: v_readlane_b32 s19, v23, 59
; GCN-NEXT: v_readlane_b32 s20, v23, 60
; GCN-NEXT: v_readlane_b32 s21, v23, 61
; GCN-NEXT: v_readlane_b32 s22, v23, 62
; GCN-NEXT: v_readlane_b32 s23, v23, 63
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s4, v0, 0
; GCN-NEXT: v_readlane_b32 s5, v0, 1
; GCN-NEXT: v_readlane_b32 s4, v22, 0
; GCN-NEXT: v_readlane_b32 s5, v22, 1
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[8:23]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use s[4:5]
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: .LBB0_2: ; %ret
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[24:25]
; GCN-NEXT: ; kill: killed $vgpr1
; GCN-NEXT: ; kill: killed $vgpr0
; GCN-NEXT: s_endpgm
call void asm sideeffect "", "~{v[0:7]}" () #0
call void asm sideeffect "", "~{v[8:15]}" () #0
call void asm sideeffect "", "~{v[16:19]}"() #0
call void asm sideeffect "", "~{v[20:21]}"() #0
call void asm sideeffect "", "~{v22}"() #0

%wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0