422 changes: 422 additions & 0 deletions llvm/test/CodeGen/AMDGPU/cc-update.ll

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ done:

; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}}
; GCN: {{^}}BB4_2:
define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
Expand Down Expand Up @@ -174,9 +174,9 @@ done:
; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
; GCN: s_and_saveexec_b64
; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}}
; GCN: {{^BB[0-9]+}}_2:

define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
Expand Down Expand Up @@ -213,8 +213,8 @@ done:

; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32:
; GCN: s_and_saveexec_b64
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
; GCN: {{^BB[0-9]+}}_2:
define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
Expand Down
28 changes: 15 additions & 13 deletions llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ define <2 x half> @chain_hi_to_lo_private() {
; GCN-LABEL: chain_hi_to_lo_private:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:2
; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s33
; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
Expand All @@ -26,9 +26,9 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %ba
; GCN-LABEL: chain_hi_to_lo_private_different_bases:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen
; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], s33 offen
; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
Expand All @@ -46,7 +46,7 @@ define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in)
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f16_e32 v1, 1.0, v1
; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen
; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
Expand Down Expand Up @@ -196,27 +196,29 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: global_load_ushort v4, v[2:3], off
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:4
; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:4
; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:6
; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:6
; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:8
; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], s9 offset:4
; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], s9 offset:6
; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8
; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4
; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, v4
; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], s9 offset:8
; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8
; GCN-NEXT: v_lshl_or_b32 v2, v4, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
Expand Down Expand Up @@ -298,10 +300,10 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
; GCN-LABEL: chain_hi_to_lo_private_other_dep:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen
; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s33 offen offset:2
; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ bb.end: ; preds = %bb.then, %bb
; GCN: s_andn2_b64 exec, exec,
; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]

; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offen

; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER]], {{vcc|s\[[0-9:]+\]}}
; GCN-NEXT: s_cbranch_execz [[BB1_OUTER_LOOP]]
Expand Down
64 changes: 32 additions & 32 deletions llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@
; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:20 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}

Expand All @@ -40,13 +40,13 @@
; GCN: ; %bb.{{[0-9]+}}: ; %if
; GCN: s_mov_b32 m0, -1
; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)


; Spill val register
; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]]
; GCN: buffer_store_dword [[VAL]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[VAL]], off, s[0:3], 0 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; VMEM: [[ENDIF]]:

Expand All @@ -56,18 +56,18 @@



; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:20 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]

; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}

; Restore val
; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
Expand Down Expand Up @@ -102,45 +102,45 @@ endif:
; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]


; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:28 ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:28 ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}

; GCN-NEXT: s_cbranch_execz [[END:BB[0-9]+_[0-9]+]]


; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
; GCN: s_cmp_lg_u32
; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN-NEXT: s_cbranch_scc1 [[LOOP]]


; GCN: [[END]]:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:28 ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:28 ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]

; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
Expand Down Expand Up @@ -179,16 +179,16 @@ end:
; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, [[CMP0]]

Expand All @@ -201,18 +201,18 @@ end:
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]


; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET]]
; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET]]
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]]

; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}

; Regular spill value restored after exec modification
; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload


; Spill saved exec
Expand All @@ -221,44 +221,44 @@ end:


; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]]
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]]
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}
; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]


; GCN: ; %bb.{{[0-9]+}}: ; %if
; GCN: ds_read_b32
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: [[ELSE]]: ; %else
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN-NEXT: s_branch [[FLOW]]

; GCN: [[ENDIF]]:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]


; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]

; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}

; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
entry:
Expand Down
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
Original file line number Diff line number Diff line change
Expand Up @@ -169,14 +169,15 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
; GCN-LABEL: v3i16_registers:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s33, s9
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, 1, s4
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_cbranch_vccz BB4_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_mov_b32 s4, 0
Expand Down Expand Up @@ -213,14 +214,15 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
; GCN-LABEL: v3f16_registers:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s33, s9
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33
; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, 1, s4
; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_cbranch_vccz BB5_2
; GCN-NEXT: ; %bb.1:
; GCN-NEXT: s_mov_b32 s4, 0
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/extload-private.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

; FUNC-LABEL: {{^}}load_i8_sext_private:
; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i8, addrspace(5)
Expand All @@ -13,7 +13,7 @@ entry:
}

; FUNC-LABEL: {{^}}load_i8_zext_private:
; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i8, addrspace(5)
Expand All @@ -24,7 +24,7 @@ entry:
}

; FUNC-LABEL: {{^}}load_i16_sext_private:
; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i16, addrspace(5)
Expand All @@ -35,7 +35,7 @@ entry:
}

; FUNC-LABEL: {{^}}load_i16_zext_private:
; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i16, addrspace(5)
Expand Down
77 changes: 38 additions & 39 deletions llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
Expand All @@ -20,8 +20,8 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
Expand All @@ -30,8 +30,8 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
; GFX9-LABEL: private_load_2xi16_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], s33 offen
; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen offset:2
; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen
; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -53,8 +53,8 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -64,18 +64,18 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)*
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v3, 1
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 2
; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
; GFX7-UNALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 1
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], s33 offen
; GFX9-NEXT: v_mov_b32_e32 v0, 2
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], s33 offen offset:2
; GFX9-NEXT: v_mov_b32_e32 v2, 2
; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX9-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
Expand All @@ -89,36 +89,35 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen
; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -141,31 +140,31 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)*
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1
; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0
; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v4, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v1
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align1:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
Expand All @@ -186,21 +185,21 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align4:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen
; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -233,7 +232,7 @@ define void @private_store_2xi16_align4(i16 addrspace(5)* %p, i16 addrspace(5)*
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0x20001
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
Expand Down
197 changes: 178 additions & 19 deletions llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - | FileCheck -check-prefix=GCN %s

# Kernels can have no FP
---
name: no_fold_fi_non_stack_rsrc_soffset
name: kernel_no_fold_fi_non_stack_rsrc_and_soffset
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
Expand All @@ -12,14 +13,12 @@ stack:
machineFunctionInfo:
isEntryFunction: true
scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
scratchWaveOffsetReg: '$sgpr6'
frameOffsetReg: '$sgpr6'
stackPtrOffsetReg: '$sgpr6'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $sgpr12_sgpr13_sgpr14_sgpr15
; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset
; GCN-LABEL: name: kernel_no_fold_fi_non_stack_rsrc_and_soffset
; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
Expand All @@ -36,7 +35,7 @@ body: |
...

---
name: no_fold_fi_non_stack_rsrc
name: kernel_no_fold_fi_non_stack_rsrc
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
Expand All @@ -46,14 +45,12 @@ stack:
machineFunctionInfo:
isEntryFunction: true
scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
scratchWaveOffsetReg: '$sgpr6'
frameOffsetReg: '$sgpr6'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $sgpr12_sgpr13_sgpr14_sgpr15
; GCN-LABEL: name: no_fold_fi_non_stack_rsrc
; GCN-LABEL: name: kernel_no_fold_fi_non_stack_rsrc
; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
Expand All @@ -68,9 +65,8 @@ body: |
...

# Offset is from global scratch wave offset.
---
name: fold_fi_mubuf_scratch_scratch_wave_offset
name: kernel_no_fold_fi_non_stack_soffset
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
Expand All @@ -80,12 +76,44 @@ stack:
machineFunctionInfo:
isEntryFunction: true
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
scratchWaveOffsetReg: '$sgpr33'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset
; GCN-LABEL: name: kernel_no_fold_fi_non_stack_soffset
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GCN: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
%2:sreg_32_xm0 = S_MOV_B32 0
BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec
%3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec
$vgpr0 = COPY %3
S_ENDPGM 0, implicit $vgpr0
...

---
name: kernel_fold_fi_mubuf
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 4
stack:
- { id: 0, size: 4, alignment: 4, local-offset: 0 }
machineFunctionInfo:
isEntryFunction: true
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
; GCN-LABEL: name: kernel_fold_fi_mubuf
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
Expand All @@ -94,30 +122,161 @@ body: |
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec
%2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec
BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
%2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
$vgpr0 = COPY %2
S_ENDPGM 0, implicit $vgpr0
...


# Functions have an unswizzled SP/FP relative to the wave offset
---
name: no_fold_fi_mubuf_scratch_sp_offset
name: function_no_fold_fi_non_stack_rsrc_and_soffset
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 4
stack:
- { id: 0, size: 4, alignment: 4, local-offset: 0 }
machineFunctionInfo:
isEntryFunction: true
isEntryFunction: false
scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
frameOffsetReg: '$sgpr32'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $sgpr12_sgpr13_sgpr14_sgpr15
; GCN-LABEL: name: function_no_fold_fi_non_stack_rsrc_and_soffset
; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
; GCN: SI_RETURN_TO_EPILOG $vgpr0
%0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
%1:sreg_32_xm0 = S_MOV_B32 0
%2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec
$vgpr0 = COPY %3
SI_RETURN_TO_EPILOG $vgpr0
...

# Non-entry function (isEntryFunction: false). The buffer load below uses a
# copy of $sgpr12_sgpr13_sgpr14_sgpr15 as its resource descriptor instead of
# the declared scratchRSrcReg ($sgpr96_sgpr97_sgpr98_sgpr99), while its soffset
# operand is $sgpr32, the declared stackPtrOffsetReg/frameOffsetReg.
# The expected output keeps the V_MOV_B32_e32 of %stack.0 and still feeds it to
# the load, i.e. the frame index is not folded into the MUBUF instruction.
---
name: function_no_fold_fi_non_stack_rsrc
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 4
stack:
- { id: 0, size: 4, alignment: 4, local-offset: 0 }
machineFunctionInfo:
isEntryFunction: false
scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
frameOffsetReg: '$sgpr32'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
liveins: $sgpr12_sgpr13_sgpr14_sgpr15
; GCN-LABEL: name: function_no_fold_fi_non_stack_rsrc
; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
; GCN: SI_RETURN_TO_EPILOG $vgpr0
%0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
%2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
$vgpr0 = COPY %3
SI_RETURN_TO_EPILOG $vgpr0
...

# Non-entry function (isEntryFunction: false). The store/load pair uses the
# declared scratchRSrcReg ($sgpr0_sgpr1_sgpr2_sgpr3) as its resource descriptor,
# but its soffset operand in the input is the immediate 0 rather than the
# declared stackPtrOffsetReg/frameOffsetReg ($sgpr32).
# NOTE(review): despite the "no_fold" in the name, the autogenerated checks
# expect %stack.0 to be used directly as vaddr and soffset to become $sgpr32,
# i.e. the frame index IS folded. The input body and expectations match
# function_fold_fi_mubuf_wave_relative; confirm this is intended.
---
name: function_no_fold_fi_non_stack_soffset
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 4
stack:
- { id: 0, size: 4, alignment: 4, local-offset: 0 }
machineFunctionInfo:
isEntryFunction: false
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
frameOffsetReg: '$sgpr32'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
; GCN-LABEL: name: function_no_fold_fi_non_stack_soffset
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GCN: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
%2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
$vgpr0 = COPY %2
S_ENDPGM 0, implicit $vgpr0
...

# Non-entry function (isEntryFunction: false). The store/load pair addresses
# %stack.0 through a VGPR, with the declared scratchRSrcReg
# ($sgpr0_sgpr1_sgpr2_sgpr3) as the resource descriptor and an immediate 0 as
# the soffset operand.
# The expected output folds the frame index: %stack.0 appears directly as vaddr,
# the V_MOV_B32_e32 of %stack.0 is gone, and soffset is rewritten to $sgpr32
# (the declared stackPtrOffsetReg/frameOffsetReg).
# NOTE(review): "wave_relative" presumably refers to the soffset of 0 in the
# input; confirm against the si-fold-operands implementation.
---
name: function_fold_fi_mubuf_wave_relative
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 4
stack:
- { id: 0, size: 4, alignment: 4, local-offset: 0 }
machineFunctionInfo:
isEntryFunction: false
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
frameOffsetReg: '$sgpr32'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
; GCN-LABEL: name: function_fold_fi_mubuf_wave_relative
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
; GCN: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
%2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
$vgpr0 = COPY %2
S_ENDPGM 0, implicit $vgpr0
...

---
name: function_fold_fi_mubuf_stack_relative
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
localFrameSize: 4
stack:
- { id: 0, size: 4, alignment: 4, local-offset: 0 }
machineFunctionInfo:
isEntryFunction: false
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
scratchWaveOffsetReg: '$sgpr33'
frameOffsetReg: '$sgpr32'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset
; GCN-LABEL: name: function_fold_fi_mubuf_stack_relative
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
Expand Down
63 changes: 25 additions & 38 deletions llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
; Materialize into a mov. Make sure there isn't an unnecessary copy.
; GCN-LABEL: {{^}}func_mov_fi_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33

; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]]
; CI-NEXT: v_lshr_b32_e64 v0, s32, 6
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s32

; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
Expand All @@ -24,19 +23,15 @@ define void @func_mov_fi_i32() #0 {
; GCN-LABEL: {{^}}func_mov_fi_i32_offset:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

; CI: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; CI-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; CI-DAG: v_lshr_b32_e64 v0, [[SUB0]], 6
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB1]], 6
; CI-DAG: v_lshr_b32_e64 v0, s32, 6
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
; CI-NOT: v_mov
; CI: ds_write_b32 v0, v0
; CI-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]]
; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]]
; CI-NEXT: ds_write_b32 v0, v0

; GFX9: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; GFX9-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB0]]
; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB1]]
; GFX9: v_lshrrev_b32_e64 v0, 6, s32
; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
; GFX9-DAG: ds_write_b32 v0, v0
; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
; GFX9-NEXT: ds_write_b32 v0, v0
Expand All @@ -53,15 +48,13 @@ define void @func_mov_fi_i32_offset() #0 {

; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33

; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
; CI: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]]

; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
; GFX9: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]


; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @func_add_constant_to_fi_i32() #0 {
Expand All @@ -75,11 +68,10 @@ define void @func_add_constant_to_fi_i32() #0 {
; into.

; GCN-LABEL: {{^}}func_other_fi_user_i32:
; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33

; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
; CI: v_lshr_b32_e64 v0, s32, 6

; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]]
; GFX9: v_lshrrev_b32_e64 v0, 6, s32

; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0
; GCN-NOT: v_mov
Expand All @@ -94,28 +86,27 @@ define void @func_other_fi_user_i32() #0 {

; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
; GCN: v_mov_b32_e32 v1, 15{{$}}
; GCN: buffer_store_dword v1, v0, s[0:3], s33 offen{{$}}
; GCN: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}}
define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
store volatile i32 15, i32 addrspace(5)* %ptr
ret void
}

; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
; GCN: s_waitcnt
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen{{$}}
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}}
define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
%val = load volatile i32, i32 addrspace(5)* %ptr
ret void
}

; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
; GCN: s_waitcnt
; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33

; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]

; GFX9-NEXT: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32
; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]

; GCN-NOT: v_mov
Expand Down Expand Up @@ -143,11 +134,10 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
}

; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33

; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6

; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32

; GCN: s_and_saveexec_b64

Expand Down Expand Up @@ -175,13 +165,12 @@ ret:

; Added offset can't be used with VOP3 add
; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33
; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200

; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]]

; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]

; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]]
Expand All @@ -199,13 +188,12 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
}

; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s33
; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200

; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6
; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6
; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]

; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]]
; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]]

; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]]
Expand Down Expand Up @@ -256,12 +244,11 @@ bb5:
; GCN-LABEL: {{^}}alloca_ptr_nonentry_block:
; GCN: s_and_saveexec_b64
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33

; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]

; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32
; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]

; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ machineFunctionInfo:
isEntryFunction: true
waveLimiter: true
scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
scratchWaveOffsetReg: '$sgpr101'
frameOffsetReg: '$sgpr101'
stackPtrOffsetReg: '$sgpr32'
argumentInfo:
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,8 @@ machineFunctionInfo:
isEntryFunction: true
waveLimiter: true
scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
scratchWaveOffsetReg: '$sgpr101'
frameOffsetReg: '$sgpr101'
stackPtrOffsetReg: '$sgpr32'
frameOffsetReg: '$sgpr34'
argumentInfo:
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
dispatchPtr: { reg: '$sgpr4_sgpr5' }
Expand Down
202 changes: 101 additions & 101 deletions llvm/test/CodeGen/AMDGPU/function-returns.ll

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ entry:
}

; CHECK: .name: num_spilled_sgprs
; GFX700: .sgpr_spill_count: 40
; GFX803: .sgpr_spill_count: 24
; GFX900: .sgpr_spill_count: 24
; GFX1010: .sgpr_spill_count: 24
; GFX700: .sgpr_spill_count: 38
; GFX803: .sgpr_spill_count: 22
; GFX900: .sgpr_spill_count: 22
; GFX1010: .sgpr_spill_count: 22
; CHECK: .symbol: num_spilled_sgprs.kd
define amdgpu_kernel void @num_spilled_sgprs(
i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ entry:
; CHECK-LABEL: - Name: num_spilled_sgprs
; CHECK: SymbolName: 'num_spilled_sgprs@kd'
; CHECK: CodeProps:
; GFX700: NumSpilledSGPRs: 40
; GFX803: NumSpilledSGPRs: 24
; GFX900: NumSpilledSGPRs: 24
; GFX700: NumSpilledSGPRs: 38
; GFX803: NumSpilledSGPRs: 22
; GFX900: NumSpilledSGPRs: 22
define amdgpu_kernel void @num_spilled_sgprs(
i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
Expand Down
2,277 changes: 1,140 additions & 1,137 deletions llvm/test/CodeGen/AMDGPU/idot8s.ll

Large diffs are not rendered by default.

2,572 changes: 1,286 additions & 1,286 deletions llvm/test/CodeGen/AMDGPU/idot8u.ll

Large diffs are not rendered by default.

104 changes: 52 additions & 52 deletions llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,40 +16,40 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
; GCN: renamable $sgpr2 = COPY renamable $sgpr1
; GCN: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
; GCN: renamable $sgpr1 = S_MOV_B32 61440
; GCN: renamable $sgpr4 = S_MOV_B32 -1
; GCN: undef renamable $sgpr8 = COPY killed renamable $sgpr0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
; GCN: renamable $sgpr9 = COPY killed renamable $sgpr2
; GCN: renamable $sgpr10 = COPY killed renamable $sgpr4
; GCN: renamable $sgpr11 = COPY killed renamable $sgpr1
; GCN: renamable $sgpr3 = S_MOV_B32 -1
; GCN: undef renamable $sgpr4 = COPY killed renamable $sgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
; GCN: renamable $sgpr5 = COPY killed renamable $sgpr2
; GCN: renamable $sgpr6 = COPY killed renamable $sgpr3
; GCN: renamable $sgpr7 = COPY killed renamable $sgpr1
; GCN: renamable $sgpr0 = S_MOV_B32 16
; GCN: renamable $sgpr1 = S_MOV_B32 15
; GCN: renamable $sgpr2 = S_MOV_B32 14
; GCN: renamable $sgpr4 = S_MOV_B32 13
; GCN: renamable $sgpr5 = S_MOV_B32 12
; GCN: renamable $sgpr6 = S_MOV_B32 11
; GCN: renamable $sgpr7 = S_MOV_B32 10
; GCN: renamable $sgpr12 = S_MOV_B32 9
; GCN: renamable $sgpr13 = S_MOV_B32 8
; GCN: renamable $sgpr14 = S_MOV_B32 7
; GCN: renamable $sgpr15 = S_MOV_B32 6
; GCN: renamable $sgpr16 = S_MOV_B32 5
; GCN: renamable $sgpr17 = S_MOV_B32 3
; GCN: renamable $sgpr18 = S_MOV_B32 2
; GCN: renamable $sgpr19 = S_MOV_B32 1
; GCN: renamable $sgpr20 = S_MOV_B32 0
; GCN: renamable $vgpr1 = COPY killed renamable $sgpr20
; GCN: renamable $vgpr2 = COPY killed renamable $sgpr19
; GCN: renamable $vgpr3 = COPY killed renamable $sgpr18
; GCN: renamable $vgpr4 = COPY killed renamable $sgpr17
; GCN: renamable $vgpr5 = COPY killed renamable $sgpr16
; GCN: renamable $vgpr6 = COPY killed renamable $sgpr15
; GCN: renamable $vgpr7 = COPY killed renamable $sgpr14
; GCN: renamable $vgpr8 = COPY killed renamable $sgpr13
; GCN: renamable $vgpr9 = COPY killed renamable $sgpr12
; GCN: renamable $vgpr10 = COPY killed renamable $sgpr7
; GCN: renamable $vgpr11 = COPY killed renamable $sgpr6
; GCN: renamable $vgpr12 = COPY killed renamable $sgpr5
; GCN: renamable $vgpr13 = COPY killed renamable $sgpr4
; GCN: renamable $sgpr3 = S_MOV_B32 13
; GCN: renamable $sgpr8 = S_MOV_B32 12
; GCN: renamable $sgpr9 = S_MOV_B32 11
; GCN: renamable $sgpr10 = S_MOV_B32 10
; GCN: renamable $sgpr11 = S_MOV_B32 9
; GCN: renamable $sgpr12 = S_MOV_B32 8
; GCN: renamable $sgpr13 = S_MOV_B32 7
; GCN: renamable $sgpr14 = S_MOV_B32 6
; GCN: renamable $sgpr15 = S_MOV_B32 5
; GCN: renamable $sgpr16 = S_MOV_B32 3
; GCN: renamable $sgpr17 = S_MOV_B32 2
; GCN: renamable $sgpr18 = S_MOV_B32 1
; GCN: renamable $sgpr19 = S_MOV_B32 0
; GCN: renamable $vgpr1 = COPY killed renamable $sgpr19
; GCN: renamable $vgpr2 = COPY killed renamable $sgpr18
; GCN: renamable $vgpr3 = COPY killed renamable $sgpr17
; GCN: renamable $vgpr4 = COPY killed renamable $sgpr16
; GCN: renamable $vgpr5 = COPY killed renamable $sgpr15
; GCN: renamable $vgpr6 = COPY killed renamable $sgpr14
; GCN: renamable $vgpr7 = COPY killed renamable $sgpr13
; GCN: renamable $vgpr8 = COPY killed renamable $sgpr12
; GCN: renamable $vgpr9 = COPY killed renamable $sgpr11
; GCN: renamable $vgpr10 = COPY killed renamable $sgpr10
; GCN: renamable $vgpr11 = COPY killed renamable $sgpr9
; GCN: renamable $vgpr12 = COPY killed renamable $sgpr8
; GCN: renamable $vgpr13 = COPY killed renamable $sgpr3
; GCN: renamable $vgpr14 = COPY killed renamable $sgpr2
; GCN: renamable $vgpr15 = COPY killed renamable $sgpr1
; GCN: renamable $vgpr16 = COPY killed renamable $sgpr0
Expand All @@ -69,44 +69,44 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
; GCN: renamable $vgpr30 = COPY killed renamable $vgpr14
; GCN: renamable $vgpr31 = COPY killed renamable $vgpr15
; GCN: renamable $vgpr32 = COPY killed renamable $vgpr16
; GCN: renamable $sgpr22_sgpr23 = S_MOV_B64 $exec
; GCN: renamable $sgpr20_sgpr21 = S_MOV_B64 $exec
; GCN: renamable $vgpr1 = IMPLICIT_DEF
; GCN: renamable $sgpr24_sgpr25 = IMPLICIT_DEF
; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
; GCN: SI_SPILL_S128_SAVE killed $sgpr8_sgpr9_sgpr10_sgpr11, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 16 into %stack.1, align 4, addrspace 5)
; GCN: SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5)
; GCN: SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.3, align 4, addrspace 5)
; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
; GCN: SI_SPILL_S64_SAVE killed $sgpr24_sgpr25, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5)
; GCN: renamable $sgpr22_sgpr23 = IMPLICIT_DEF
; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5)
; GCN: SI_SPILL_S128_SAVE killed $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5)
; GCN: SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5)
; GCN: SI_SPILL_S64_SAVE killed $sgpr20_sgpr21, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.3, align 4, addrspace 5)
; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
; GCN: SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5)
; GCN: bb.1:
; GCN: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.5, align 4, addrspace 5)
; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.5, align 4, addrspace 5)
; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
; GCN: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr1, implicit $exec
; GCN: renamable $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec
; GCN: S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit undef $m0
; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5)
; GCN: renamable $vgpr18 = V_MOV_B32_e32 undef $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0
; GCN: S_SET_GPR_IDX_OFF
; GCN: renamable $vgpr19 = COPY renamable $vgpr18
; GCN: renamable $sgpr6_sgpr7 = COPY renamable $sgpr4_sgpr5
; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5)
; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.6, align 4, addrspace 5)
; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5)
; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5)
; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.6, align 4, addrspace 5)
; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5)
; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
; GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
; GCN: bb.3:
; GCN: successors: %bb.2(0x80000000)
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.3, align 4, addrspace 5)
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.3, align 4, addrspace 5)
; GCN: $exec = S_MOV_B64 killed renamable $sgpr0_sgpr1
; GCN: bb.2:
; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 16 from %stack.1, align 4, addrspace 5)
; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 16 from %stack.1, align 4, addrspace 5)
; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
; GCN: S_ENDPGM 0
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
Expand Down
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AMDGPU/indirect-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,12 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr() {
; GCN-NEXT: runtime_loader_kernel_symbol = 0
; GCN-NEXT: .end_amd_kernel_code_t
; GCN-NEXT: ; %bb.0:
; GCN-NEXT: s_mov_b32 s33, s17
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_u32 s12, s12, s33
; GCN-NEXT: s_add_u32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, gv.fptr0@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, gv.fptr0@rel32@hi+4
Expand Down Expand Up @@ -167,11 +168,12 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
; GCN-NEXT: runtime_loader_kernel_symbol = 0
; GCN-NEXT: .end_amd_kernel_code_t
; GCN-NEXT: ; %bb.0:
; GCN-NEXT: s_mov_b32 s33, s17
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_u32 s12, s12, s33
; GCN-NEXT: s_add_u32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, gv.fptr1@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, gv.fptr1@rel32@hi+4
Expand Down
48 changes: 26 additions & 22 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1620,9 +1620,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; SI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x10
; SI-NEXT: s_load_dword s4, s[4:5], 0x20
; SI-NEXT: s_add_u32 s0, s0, s7
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_mov_b32_e32 v16, 64
; SI-NEXT: s_mov_b32 s11, 0x100f000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: s_and_b32 s4, s4, 7
Expand All @@ -1642,18 +1642,20 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; SI-NEXT: v_mov_b32_e32 v9, s21
; SI-NEXT: v_mov_b32_e32 v10, s22
; SI-NEXT: v_mov_b32_e32 v11, s23
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; SI-NEXT: v_or_b32_e32 v16, s4, v16
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96
; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112
; SI-NEXT: s_mov_b32 s11, 0x100f000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
Expand All @@ -1666,9 +1668,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; VI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x40
; VI-NEXT: s_load_dword s4, s[4:5], 0x80
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v16, 64
; VI-NEXT: s_mov_b32 s11, 0x1100f000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_and_b32 s4, s4, 7
Expand All @@ -1688,18 +1690,20 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v9, s21
; VI-NEXT: v_mov_b32_e32 v10, s22
; VI-NEXT: v_mov_b32_e32 v11, s23
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; VI-NEXT: v_or_b32_e32 v16, s4, v16
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96
; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112
; VI-NEXT: s_mov_b32 s11, 0x1100f000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/ipra.ll
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ define hidden void @func() #1 {
; GCN-NOT: writelane
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8

; GCN: ; NumSgprs: 38
; GCN: ; NumSgprs: 37
; GCN: ; NumVgprs: 9
define amdgpu_kernel void @kernel_call() #0 {
%vgpr = load volatile i32, i32 addrspace(1)* undef
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@
; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), [[FLAT_SCR_LO]]
; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), [[FLAT_SCR_HI]]

; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen
; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen

; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5)
; ALL: ; ScratchSize: 32772
Expand Down
42 changes: 25 additions & 17 deletions llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s

; ALL-LABEL: {{^}}large_alloca_pixel_shader:
; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s10, -1
; CI-DAG: s_mov_b32 s11, 0xe8f000
; VI-DAG: s_mov_b32 s11, 0xe80000
; GFX9-DAG: s_mov_b32 s11, 0xe00000
; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s6, -1

; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
; CI-DAG: s_mov_b32 s7, 0xe8f000
; VI-DAG: s_mov_b32 s7, 0xe80000
; GFX9-DAG: s_mov_b32 s7, 0xe00000

; GCN: s_add_u32 s4, s4, s0
; GCN: s_addc_u32 s5, s5, 0

; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen

; ALL: ; ScratchSize: 32772
define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
Expand All @@ -25,15 +29,19 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
}

; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg:
; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s10, -1
; CI-DAG: s_mov_b32 s11, 0xe8f000
; VI-DAG: s_mov_b32 s11, 0xe80000
; GFX9-DAG: s_mov_b32 s11, 0xe00000

; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s6, -1

; CI-DAG: s_mov_b32 s7, 0xe8f000
; VI-DAG: s_mov_b32 s7, 0xe80000
; GFX9-DAG: s_mov_b32 s7, 0xe00000

; GCN: s_add_u32 s4, s4, s2
; GCN: s_addc_u32 s5, s5, 0

; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen

; ALL: ; ScratchSize: 32772
define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 {
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
; FIXME: Requires stack object to not assert
; GCN-LABEL: {{^}}test_ps:
; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GCN: buffer_store_dword v0, off, s[4:7], s2 offset:4
; GCN: buffer_store_dword v0, off, s[4:7], 0 offset:4
; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: ; return
Expand All @@ -18,7 +18,7 @@ define amdgpu_ps i32 @test_ps() #1 {

; GCN-LABEL: {{^}}test_cs:
; GCN: s_mov_b64 s[4:5], s[0:1]
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:4
; GCN: s_load_dword s0, s[0:1], 0x0
define amdgpu_cs i32 @test_cs() #1 {
%alloca = alloca i32, addrspace(5)
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/load-hi16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -531,13 +531,13 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s33 offset:4094{{$}}
; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
; GFX900: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
entry:
%load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
Expand All @@ -549,13 +549,13 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
entry:
%load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
Expand Down Expand Up @@ -649,13 +649,13 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
Expand All @@ -668,13 +668,13 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}}
; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
Expand All @@ -687,13 +687,13 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
entry:
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AMDGPU/load-lo16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1303,7 +1303,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -1312,7 +1312,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
Expand All @@ -1323,7 +1323,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
Expand All @@ -1342,7 +1342,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -1351,7 +1351,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
Expand All @@ -1362,7 +1362,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %
; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
Expand All @@ -1381,7 +1381,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -1390,7 +1390,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
Expand All @@ -1402,7 +1402,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
Expand Down Expand Up @@ -1504,7 +1504,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094
; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -1513,7 +1513,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
Expand All @@ -1525,7 +1525,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
Expand All @@ -1545,7 +1545,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s33 offset:4094
; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -1554,7 +1554,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094
; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
Expand All @@ -1565,7 +1565,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in,
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094
; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
Expand All @@ -1585,7 +1585,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094
; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -1594,7 +1594,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
Expand All @@ -1607,7 +1607,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -447,8 +447,8 @@ entry:
}

; GCN-LABEL: {{^}}nontemporal_private_0:
; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_0
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
Expand All @@ -462,8 +462,8 @@ entry:
}

; GCN-LABEL: {{^}}nontemporal_private_1:
; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,8 @@ entry:
}

; GCN-LABEL: {{^}}nontemporal_private_0:
; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_0
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
Expand All @@ -329,8 +329,8 @@ entry:
}

; GCN-LABEL: {{^}}nontemporal_private_1:
; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}}
; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}}
; GFX10: .amdhsa_kernel nontemporal_private_1
; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
; GFX10CU: .amdhsa_workgroup_processor_mode 0
Expand Down
93 changes: 41 additions & 52 deletions llvm/test/CodeGen/AMDGPU/memory_clause.ll
Original file line number Diff line number Diff line change
Expand Up @@ -115,61 +115,50 @@ define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %ar
; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v2
; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GCN-NEXT: v_add_u32_e32 v0, v0, v2
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], s33 offen
; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], s33 offen offset:4
; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], s33 offen offset:8
; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], s33 offen offset:12
; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], s33 offen offset:16
; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], s33 offen offset:20
; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], s33 offen offset:24
; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], s33 offen offset:28
; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], s33 offen offset:32
; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], s33 offen offset:36
; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], s33 offen offset:40
; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], s33 offen offset:44
; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], s33 offen offset:48
; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], s33 offen offset:52
; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], s33 offen offset:56
; GCN-NEXT: v_add_u32_e32 v1, v1, v2
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen offset:60
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], s33 offen
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], s33 offen offset:4
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], s33 offen offset:8
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], s33 offen offset:12
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], s33 offen offset:16
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], s33 offen offset:20
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], s33 offen offset:24
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], s33 offen offset:28
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], s33 offen offset:32
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], s33 offen offset:36
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], s33 offen offset:40
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], s33 offen offset:44
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], s33 offen offset:48
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], s33 offen offset:52
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], s33 offen offset:56
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen offset:60
; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:20
; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:24
; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:28
; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:32
; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:36
; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:40
; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:44
; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:48
; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:52
; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:56
; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:60
; GCN-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:20
; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:24
; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:28
; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:32
; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:36
; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:40
; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:44
; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:48
; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:52
; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:56
; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:60
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/mesa3d.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
; GCN-DAG: s_mov_b32 s6, -1{{$}}
; GCN-DAG: s_mov_b32 s7, 0xe8f000
; GCN-DAG: v_mov_b32_e32 [[V:v[0-9]+]], 2
; GCN: buffer_store_dword [[V]], off, s[4:7], s2 offset:4
; GCN: buffer_store_dword [[V]], off, s[4:7], 0 offset:4
define amdgpu_ps void @scratch_ps(i32 addrspace(1)* %out, i32 %in) {
entry:
%alloca = alloca i32, addrspace(5)
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ frameInfo:
maxAlignment: 4
machineFunctionInfo:
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
scratchWaveOffsetReg: '$sgpr4'
frameOffsetReg: '$sgpr5'
stackPtrOffsetReg: '$sgpr32'
body: |
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/misched-killflags.mir
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ tracksRegLiveness: true
machineFunctionInfo:
isEntryFunction: true
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
scratchWaveOffsetReg: '$sgpr7'
frameOffsetReg: '$sgpr7'
body: |
bb.0:
Expand Down
Loading