79 changes: 43 additions & 36 deletions llvm/test/CodeGen/AMDGPU/ds_write2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:8
; CI-NEXT: s_endpgm
;
Expand All @@ -62,8 +63,9 @@ define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, flo
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -92,12 +94,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v0, v1 offset:32
; CI-NEXT: s_endpgm
;
Expand All @@ -106,11 +108,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Expand Down Expand Up @@ -138,12 +140,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
; CI-NEXT: s_mov_b64 s[0:1], s[4:5]
; CI-NEXT: s_mov_b64 s[4:5], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: ds_write_b32 v0, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write_b32 v0, v1 offset:32
; CI-NEXT: s_endpgm
;
Expand All @@ -152,11 +154,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Expand Down Expand Up @@ -186,11 +188,12 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8
; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: ds_write2_b32 v0, v3, v2 offset1:8
; CI-NEXT: s_endpgm
;
Expand All @@ -200,9 +203,9 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8
; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -308,10 +311,11 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v2, v1 offset1:255
; CI-NEXT: s_endpgm
;
Expand All @@ -320,8 +324,9 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:255
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -730,10 +735,11 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:8
; CI-NEXT: s_endpgm
;
Expand All @@ -742,8 +748,9 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
; GFX9-NEXT: s_endpgm
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)*
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4

; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4{{$}}
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 glc{{$}}


; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
Expand All @@ -56,10 +56,10 @@ define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)*
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4

; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4

; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
Expand All @@ -84,10 +84,10 @@ define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrsp
; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0

; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8

; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}}
; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:8

; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: v_mov_b32_e32 v2, 9
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: BB1_2: ; %bb1
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: v_mov_b32_e32 v2, 10
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_co_br_user:
Expand All @@ -113,11 +115,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX9-NEXT: v_mov_b32_e32 v2, 9
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: BB1_2: ; %bb1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 10
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_co_br_user:
Expand All @@ -137,11 +141,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: v_mov_b32_e32 v2, 9
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: BB1_2: ; %bb1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 10
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
bb:
%i1 = add i32 %i, %i
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/extload-private.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ entry:
}

; FUNC-LABEL: {{^}}load_i16_zext_private:
; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}}
; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 glc{{$}}
define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
entry:
%tmp0 = alloca i16, addrspace(5)
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@ define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
entry:
Expand All @@ -130,10 +130,10 @@ define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v2, off, s[4:7], 0
; GCN-NEXT: s_endpgm
entry:
Expand All @@ -155,11 +155,11 @@ define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)*
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s12, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
Expand Down
16 changes: 16 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
Expand Down Expand Up @@ -267,12 +271,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs() #0 {
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
Expand Down Expand Up @@ -305,12 +313,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs() #0 {
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MUL]], [[Z]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
Expand Down Expand Up @@ -344,12 +356,16 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() #
; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[MUL]]
; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)

; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]]
; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)

; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
; GCN-SLOWFMA: v_add_f32_e32
Expand Down
13 changes: 8 additions & 5 deletions llvm/test/CodeGen/AMDGPU/flat-address-space.ll
Original file line number Diff line number Diff line change
Expand Up @@ -201,28 +201,31 @@ define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
%val = load volatile i8, i8* %fptr.offset
ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
%val = load volatile i8, i8* %fptr.offset
ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}

; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
%val = load volatile i8, i8* %fptr.offset
Expand Down
468 changes: 294 additions & 174 deletions llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Large diffs are not rendered by default.

90 changes: 45 additions & 45 deletions llvm/test/CodeGen/AMDGPU/fma-combine.ll

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
declare double @llvm.maxnum.f64(double, double) nounwind readnone

; SI-LABEL: {{^}}test_fmax3_f64:
; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}}
; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]]
Expand Down
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fneg-combines.ll
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrsp
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -65,7 +67,9 @@ define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -168,7 +172,9 @@ define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, fl
; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -196,7 +202,9 @@ define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -308,7 +316,9 @@ define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -806,7 +816,9 @@ define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float
; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -1042,7 +1054,9 @@ define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float
; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -1111,7 +1125,9 @@ define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrsp
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -1142,7 +1158,9 @@ define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -1349,7 +1367,9 @@ define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)*

; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -1434,7 +1454,9 @@ define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -1859,7 +1881,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float
; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand All @@ -1881,7 +1905,9 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr
; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -2316,7 +2342,9 @@ define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -2348,7 +2376,9 @@ define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %o
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -2379,7 +2409,9 @@ define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %o
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -2418,7 +2450,9 @@ define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)*
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down Expand Up @@ -2453,7 +2487,9 @@ define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {

; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
; GCN: s_waitcnt
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}}
; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off{{$}}
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc{{$}}
; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off glc{{$}}
define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 {
%val = load volatile i32, i32 addrspace(5)* %ptr
ret void
Expand Down Expand Up @@ -162,11 +162,11 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
; GCN: s_and_saveexec_b64

; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}

; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4{{$}}
; GFX9-MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 glc{{$}}
; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}}

; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/function-args.ll
Original file line number Diff line number Diff line change
Expand Up @@ -537,10 +537,10 @@ define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8,
}

; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32{{$}}
; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 glc{{$}}
; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4 glc{{$}}
; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8 glc{{$}}
; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12 glc{{$}}

; GCN: ds_write_b32 v0, v0
; GCN: s_setpc_b64
Expand Down
48 changes: 30 additions & 18 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
Expand All @@ -168,7 +169,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
Expand All @@ -190,7 +190,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
Expand All @@ -199,7 +200,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
Expand All @@ -225,7 +225,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
Expand All @@ -234,7 +235,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
Expand All @@ -256,7 +256,8 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
Expand All @@ -265,7 +266,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
Expand Down Expand Up @@ -350,7 +350,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: global_load_sbyte v0, v[0:1], off
; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
Expand Down Expand Up @@ -378,7 +379,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: global_load_sbyte v0, v[0:1], off
; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
Expand Down Expand Up @@ -410,7 +412,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
Expand Down Expand Up @@ -438,7 +441,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
Expand Down Expand Up @@ -529,7 +533,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
Expand Down Expand Up @@ -557,7 +562,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
Expand Down Expand Up @@ -589,7 +595,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_writelane_b32 v40, s33, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s33, s32
Expand Down Expand Up @@ -617,7 +624,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_writelane_b32 v40, s33, 2
; GFX10-NEXT: s_mov_b32 s33, s32
; GFX10-NEXT: s_add_u32 s32, s32, 0x200
Expand Down Expand Up @@ -2877,6 +2885,7 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
; GFX9-NEXT: v_mov_b32_e32 v41, v1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: global_store_dword v[40:41], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v42, 0
Expand Down Expand Up @@ -2912,6 +2921,7 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
; GFX10-NEXT: v_writelane_b32 v42, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: global_store_dword v[40:41], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
Expand All @@ -2924,7 +2934,6 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %o
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[4:5]
%val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42)
store volatile i32 %val, i32 addrspace(1)* %out
Expand Down Expand Up @@ -3105,7 +3114,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX9-NEXT: v_readlane_b32 s33, v40, 2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
Expand Down Expand Up @@ -3145,13 +3156,14 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
; GFX10-NEXT: v_readlane_b32 s33, v40, 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_or_saveexec_b32 s6, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[4:5]
%in.val = alloca { i8, i32 }, align 4, addrspace(5)
%out.val = alloca { i8, i32 }, align 4, addrspace(5)
Expand Down
45 changes: 45 additions & 0 deletions llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_f16_tfe_dmask0:
Expand All @@ -37,7 +39,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask0:
Expand All @@ -55,7 +59,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { half, i32 } %v, 0
Expand All @@ -81,7 +87,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_f16_tfe_dmask1:
Expand All @@ -99,7 +107,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_f16_tfe_dmask1:
Expand All @@ -117,7 +127,9 @@ define amdgpu_ps void @load_1d_f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v1
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { half, i32 } @llvm.amdgcn.image.load.1d.sl_f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { half, i32 } %v, 0
Expand All @@ -143,7 +155,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_v2f16_tfe_dmask0:
Expand All @@ -161,7 +175,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask0:
Expand All @@ -179,7 +195,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { <2 x half>, i32 } %v, 0
Expand All @@ -205,7 +223,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_v2f16_tfe_dmask1:
Expand All @@ -223,7 +243,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask1:
Expand All @@ -241,7 +263,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask1(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x1 unorm tfe d16
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v1
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v2
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { <2 x half>, i32 } %v, 0
Expand All @@ -267,7 +291,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_v2f16_tfe_dmask3:
Expand All @@ -285,7 +311,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
; GFX10-NEXT: image_load v[1:2], v0, s[4:11] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v2f16_tfe_dmask3:
Expand All @@ -306,7 +334,9 @@ define amdgpu_ps void @load_1d_v2f16_tfe_dmask3(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v3
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { <2 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v2f16i32s.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { <2 x half>, i32 } %v, 0
Expand All @@ -333,8 +363,11 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_v3f16_tfe_dmask7:
Expand All @@ -353,8 +386,11 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7:
Expand All @@ -375,9 +411,12 @@ define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { <3 x half>, i32 } %v, 0
Expand All @@ -404,7 +443,9 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf unorm tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: load_1d_v4f16_tfe_dmask15:
Expand All @@ -423,7 +464,9 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[1:2], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v[0:1], v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask15:
Expand All @@ -448,7 +491,9 @@ define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s)
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2]
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5
; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
; GFX8-UNPACKED-NEXT: s_endpgm
%v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.data = extractvalue { <4 x half>, i32 } %v, 0
Expand Down
3 changes: 3 additions & 0 deletions llvm/test/CodeGen/AMDGPU/imm16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; VI-LABEL: store_inline_imm_neg_0.0_i16:
Expand All @@ -24,6 +25,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; SI-LABEL: store_inline_imm_neg_0.0_i16:
Expand All @@ -34,6 +36,7 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out)
; SI-NEXT: v_mov_b32_e32 v0, 0x8000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
store volatile i16 -32768, i16 addrspace(1)* %out
ret void
Expand Down
5 changes: 5 additions & 0 deletions llvm/test/CodeGen/AMDGPU/infinite-loop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_branch BB0_1
; IR-LABEL: @infinite_loop(
; IR-NEXT: entry:
Expand Down Expand Up @@ -45,6 +46,7 @@ define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB1_2
; SI-NEXT: BB1_3: ; %UnifiedReturnBlock
Expand Down Expand Up @@ -87,6 +89,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccnz BB2_2
; SI-NEXT: ; %bb.3: ; %Flow
Expand All @@ -105,6 +108,7 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
; SI-NEXT: BB2_6: ; %loop1
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccz BB2_6
; SI-NEXT: BB2_7: ; %DummyReturnBlock
Expand Down Expand Up @@ -156,6 +160,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT: s_cbranch_execnz BB3_3
; SI-NEXT: ; %bb.4: ; %loop.exit.guard
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/inline-asm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ entry:
; FIXME: Should prodbably be masking high bits of load.
; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
; CHECK: buffer_load_ubyte v0
; CHECK-NEXT: s_waitcnt
; CHECK-NEXT: buffer_load_ubyte v1
; CHECK-NEXT: s_waitcnt
; CHECK-NEXT: ASMSTART
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1577,15 +1577,15 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v2, v[0:1], off
; GFX9-NEXT: global_load_dword v2, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
Expand All @@ -1604,7 +1604,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
Expand All @@ -1614,7 +1615,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; VI-NEXT: s_lshl_b32 s0, s1, 16
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -1632,7 +1632,8 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: flat_load_dword v4, v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_mov_b32 s3, 0
; CI-NEXT: s_mov_b32 s2, 0xffff
Expand All @@ -1642,7 +1643,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_or_b32 s0, s4, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
; CI-NEXT: s_waitcnt vmcnt(0)
Expand Down
115 changes: 65 additions & 50 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
Expand All @@ -121,12 +122,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
Expand All @@ -137,8 +139,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand All @@ -165,9 +168,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand All @@ -180,11 +183,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
Expand All @@ -194,7 +197,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
Expand All @@ -219,9 +222,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
Expand All @@ -234,11 +237,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
Expand All @@ -248,7 +251,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
Expand All @@ -275,10 +278,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
Expand All @@ -295,12 +299,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
Expand All @@ -311,8 +316,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand Down Expand Up @@ -342,10 +348,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
Expand All @@ -362,12 +369,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
Expand All @@ -378,8 +386,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand Down Expand Up @@ -409,10 +418,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
Expand All @@ -429,12 +439,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
Expand All @@ -445,8 +456,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand Down Expand Up @@ -477,10 +489,11 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_endpgm
Expand All @@ -497,12 +510,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
Expand All @@ -513,8 +527,9 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,9 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspac
}

; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}

; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
; SI-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ define amdgpu_kernel void @maxnum_f16(
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
Expand All @@ -50,13 +50,13 @@ define amdgpu_kernel void @maxnum_f16(
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
Expand All @@ -75,13 +75,13 @@ define amdgpu_kernel void @maxnum_f16(
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
Expand All @@ -50,13 +50,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
Expand All @@ -75,13 +75,13 @@ define amdgpu_kernel void @minnum_f16_ieee(
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
; GFX9-NEXT: v_min_f16_e32 v0, v0, v1
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
Expand Down
44 changes: 28 additions & 16 deletions llvm/test/CodeGen/AMDGPU/load-hi16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -536,13 +536,13 @@ entry:
; GCN: s_waitcnt
; GFX900-MUBUFF: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}}
; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]]{{$}}
; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]] glc{{$}}
; GFX900: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
entry:
%load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
Expand All @@ -554,15 +554,15 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]]{{$}}
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]] glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}}
; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
entry:
%load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
Expand Down Expand Up @@ -660,15 +660,15 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}}
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
Expand All @@ -681,15 +681,15 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]]{{$}}
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]] glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}}
; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
Expand All @@ -702,15 +702,15 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}}
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}}
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}}
; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
entry:
%load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
Expand Down Expand Up @@ -805,9 +805,13 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
entry:
%obj0 = alloca [10 x i32], align 4, addrspace(5)
Expand All @@ -824,9 +828,13 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
entry:
%obj0 = alloca [10 x i32], align 4, addrspace(5)
Expand All @@ -844,9 +852,13 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
entry:
%obj0 = alloca [10 x i32], align 4, addrspace(5)
Expand Down Expand Up @@ -997,8 +1009,8 @@ entry:
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_private_v2i16_split:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32{{$}}
; GFX900-FLATSCR: scratch_load_ushort v0, off, s32{{$}}
; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32 glc{{$}}
; GFX900-FLATSCR: scratch_load_ushort v0, off, s32 glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2
Expand Down
Loading