63 changes: 32 additions & 31 deletions llvm/test/CodeGen/AMDGPU/ds_read2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -321,11 +321,11 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out,
; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -374,11 +374,11 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)
; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: ds_read_b32 v1, v1 offset:32
; GFX9-NEXT: ds_read_b32 v2, v2 offset:32
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -567,10 +567,10 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a
; GFX9-ALIGNED-LABEL: unaligned_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:1
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:2
Expand All @@ -596,10 +596,10 @@ define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float a
; GFX9-UNALIGNED-LABEL: unaligned_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
Expand Down Expand Up @@ -661,10 +661,10 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out,
; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5
; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v1 offset:6
; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:7
Expand All @@ -690,10 +690,10 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out,
; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s0, v2, 5
; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s4, v2, 5
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
Expand Down Expand Up @@ -745,10 +745,10 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %ou
; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s0, v0
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0
; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1
; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2
; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32
Expand All @@ -764,10 +764,10 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %ou
; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s0, v2
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s4, v2
; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:8
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
Expand Down Expand Up @@ -1210,10 +1210,10 @@ define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out,
; GFX9-LABEL: misaligned_read2_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
Expand Down Expand Up @@ -1241,10 +1241,10 @@ define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addr
; GFX9-LABEL: misaligned_read2_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
Expand Down Expand Up @@ -1288,16 +1288,16 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving(
;
; GFX9-LABEL: ds_read_diff_base_interleaving:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v2, s0, v1
; GFX9-NEXT: v_add_u32_e32 v3, s1, v0
; GFX9-NEXT: v_add_u32_e32 v4, s2, v1
; GFX9-NEXT: v_add_u32_e32 v6, s3, v0
; GFX9-NEXT: v_add_u32_e32 v2, s4, v1
; GFX9-NEXT: v_add_u32_e32 v3, s5, v0
; GFX9-NEXT: v_add_u32_e32 v4, s6, v1
; GFX9-NEXT: v_add_u32_e32 v6, s7, v0
; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v3 offset1:4
; GFX9-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
Expand All @@ -1312,7 +1312,7 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving(
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: global_store_dword v8, v0, s[4:5] offset:40
; GFX9-NEXT: global_store_dword v8, v0, s[2:3] offset:40
; GFX9-NEXT: s_endpgm
float addrspace(1)* nocapture %arg,
[4 x [4 x float]] addrspace(3)* %arg1,
Expand Down Expand Up @@ -1388,17 +1388,18 @@ define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspa
; GFX9-NEXT: s_getpc_b64 s[36:37]
; GFX9-NEXT: s_mov_b32 s36, s0
; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: v_mov_b32_e32 v40, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s36, s36, s3
; GFX9-NEXT: s_addc_u32 s37, s37, 0
; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s0
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12
; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s2
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX9-NEXT: ds_read_b32 v42, v41
; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
Expand Down
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/AMDGPU/ds_write2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,14 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa
; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1] glc
; GFX9-NEXT: global_load_dwordx2 v[1:2], v5, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v3, s[0:1] offset:8 glc
; GFX9-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1] offset:8 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
Expand Down Expand Up @@ -528,17 +528,17 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)*
; GFX9-LABEL: write2_ptr_subreg_arg_two_val_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v2, v1 offset:32
; GFX9-NEXT: ds_write_b32 v0, v1 offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v3, v0 offset:32
; GFX9-NEXT: ds_write_b32 v3, v2 offset:32
; GFX9-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
Expand Down Expand Up @@ -617,11 +617,11 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace
; GFX9-LABEL: misaligned_simple_write2_one_val_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v2, s0, v2
; GFX9-NEXT: v_add_u32_e32 v2, s4, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:14 offset1:15
Expand Down Expand Up @@ -674,11 +674,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add
; GFX9-ALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s0, v2
; GFX9-ALIGNED-NEXT: v_add_u32_e32 v2, s4, v2
; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX9-ALIGNED-NEXT: ds_write_b8_d16_hi v2, v0 offset:7
Expand All @@ -701,11 +701,11 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add
; GFX9-UNALIGNED-LABEL: unaligned_offset_simple_write2_one_val_f64:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s0, v2
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -976,10 +976,10 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
Expand All @@ -992,10 +992,10 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1032 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1032 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1033 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1033 %s

; FIXME: With the default attributes the eflags are not accurate for
; xnack and sramecc. Subsequent Target-ID patches will address this.

; ARCH-R600: Format: elf32-amdgpu
; ARCH-R600: Arch: r600
; ARCH-R600: AddressSize: 32bit
Expand Down Expand Up @@ -96,19 +99,15 @@
; GFX704: EF_AMDGPU_MACH_AMDGCN_GFX704 (0x26)
; GFX705: EF_AMDGPU_MACH_AMDGCN_GFX705 (0x3B)
; GFX801: EF_AMDGPU_MACH_AMDGCN_GFX801 (0x28)
; GFX801-NEXT: EF_AMDGPU_XNACK (0x100)
; GFX802: EF_AMDGPU_MACH_AMDGCN_GFX802 (0x29)
; GFX803: EF_AMDGPU_MACH_AMDGCN_GFX803 (0x2A)
; GFX805: EF_AMDGPU_MACH_AMDGCN_GFX805 (0x3C)
; GFX810: EF_AMDGPU_MACH_AMDGCN_GFX810 (0x2B)
; GFX810-NEXT: EF_AMDGPU_XNACK (0x100)
; GFX900: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C)
; GFX902: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D)
; GFX902-NEXT: EF_AMDGPU_XNACK (0x100)
; GFX904: EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E)
; GFX906: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
; GFX908: EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30)
; GFX908-NEXT: EF_AMDGPU_SRAM_ECC (0x200)
; GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31)
; GFX90C: EF_AMDGPU_MACH_AMDGCN_GFX90C (0x32)
; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
Expand Down
23 changes: 4 additions & 19 deletions llvm/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll
Original file line number Diff line number Diff line change
@@ -1,24 +1,9 @@
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX902 %s

; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s

; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s

; NO-SRAM-ECC-GFX902: Flags [
; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D)
; NO-SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100)
; NO-SRAM-ECC-GFX902-NEXT: ]
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sramecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sramecc < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s

; SRAM-ECC-GFX902: Flags [
; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D)
; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_SRAM_ECC (0x200)
; SRAM-ECC-GFX902-NEXT: EF_AMDGPU_XNACK (0x100)
; SRAM-ECC-GFX902-NEXT: ]
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 -mattr=+sramecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefix=SRAM-ECC-GFX908 %s

; NO-SRAM-ECC-GFX906: Flags [
; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ define i32 @global_load_2xi16_align2(i16 addrspace(1)* %p) #0 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2
; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
%p.0 = load i16, i16 addrspace(1)* %p, align 2
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,18 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen
; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2
; GFX9-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX9-FLASTSCR: ; %bb.0:
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off
; GFX9-FLASTSCR-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX9-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2
; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1
; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
%p.0 = load i16, i16 addrspace(5)* %p, align 2
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s

; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s

; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s

; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo --amdhsa-code-object-version=2 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/flat-scratch.ll
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,8 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT: s_mov_b32 s4, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
Expand Down Expand Up @@ -825,8 +825,8 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT: s_mov_b32 s4, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
Expand Down Expand Up @@ -1456,8 +1456,8 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT: s_mov_b32 s4, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
Expand Down Expand Up @@ -2013,8 +2013,8 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT: s_mov_b32 s4, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
Expand Down
224 changes: 112 additions & 112 deletions llvm/test/CodeGen/AMDGPU/frem.ll

Large diffs are not rendered by default.

90 changes: 45 additions & 45 deletions llvm/test/CodeGen/AMDGPU/fshl.ll
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_not_b32 s0, s0
; GFX9-NEXT: s_lshr_b32 s1, s4, 1
; GFX9-NEXT: s_not_b32 s1, s6
; GFX9-NEXT: s_lshr_b32 s0, s4, 1
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -106,11 +106,11 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; GFX9-LABEL: fshl_i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 25
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 25
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -185,21 +185,21 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s5, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: s_not_b32 s1, s1
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT: s_not_b32 s1, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_lshr_b32 s5, s5, 1
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_not_b32 s0, s0
; GFX9-NEXT: s_not_b32 s1, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT: s_lshr_b32 s1, s4, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v3
; GFX9-NEXT: s_lshr_b32 s0, s4, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -261,11 +261,11 @@ define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23
; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
Expand Down Expand Up @@ -365,37 +365,37 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
;
; GFX9-LABEL: fshl_v4i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s7, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: s_not_b32 s3, s3
; GFX9-NEXT: s_not_b32 s1, s15
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_lshr_b32 s7, s7, 1
; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1
; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: s_not_b32 s2, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: s_not_b32 s1, s14
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v0, s6, v0, 1
; GFX9-NEXT: s_lshr_b32 s3, s6, 1
; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1
; GFX9-NEXT: s_lshr_b32 s0, s6, 1
; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s9
; GFX9-NEXT: s_not_b32 s1, s1
; GFX9-NEXT: s_not_b32 s1, s13
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT: s_lshr_b32 s2, s5, 1
; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1
; GFX9-NEXT: s_lshr_b32 s0, s5, 1
; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: s_not_b32 s0, s0
; GFX9-NEXT: s_not_b32 s1, s12
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT: s_lshr_b32 s1, s4, 1
; GFX9-NEXT: v_mov_b32_e32 v5, s0
; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: s_lshr_b32 s0, s4, 1
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32:
Expand Down Expand Up @@ -470,20 +470,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
;
; GFX9-LABEL: fshl_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s10
; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 31
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s9
; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 23
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
Expand Down
50 changes: 25 additions & 25 deletions llvm/test/CodeGen/AMDGPU/fshr.ll
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -103,11 +103,11 @@ define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; GFX9-LABEL: fshr_i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 7
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 7
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -170,14 +170,14 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -236,11 +236,11 @@ define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32>
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9
; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
Expand Down Expand Up @@ -316,25 +316,25 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
;
; GFX9-LABEL: fshr_v4i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s15
; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s14
; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v5, s0
; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32:
Expand Down Expand Up @@ -401,20 +401,20 @@ define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32>
;
; GFX9-LABEL: fshr_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s10
; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s9
; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32_imm:
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/function-returns.ll
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ define <4 x half> @v4f16_func_void() #0 {
; FIXME: Should not scalarize
; GCN-LABEL: {{^}}v5i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16 v2
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64
Expand Down
151 changes: 76 additions & 75 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s
# RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK %s

# GCN-LABEL: name: break_smem_clause_simple_load_smrd8_ptr_hidden_bundle
# GCN: bb.0:
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s
# RUN: llc -march=amdgcn -mcpu=gfx902 -mattr=+xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,XNACK,GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX9 %s
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64,-xnack -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefixes=GCN,NOXNACK,GFX10 %s

# GCN-LABEL: name: break_smem_clause_max_look_ahead_in_bundle
# GCN: S_LOAD_DWORDX2_IMM
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX700,WAVE64 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX803,WAVE64 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX900,WAVE64 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX1010,WAVE32 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX700,WAVE64 %s
; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX803,WAVE64 %s
; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX900,WAVE64 %s
; RUN: llc -mattr=-xnack -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=3 -mcpu=gfx1010 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes - | FileCheck --check-prefixes=CHECK,GFX1010,WAVE32 %s

@var = addrspace(1) global float 0.0

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX700 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=2 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -mattr=-xnack -enable-misched=0 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefixes=CHECK,GFX900 %s

@var = addrspace(1) global float 0.0

Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/idiv-licm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-LABEL: udiv32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: s_sub_i32 s3, 0, s2
Expand Down Expand Up @@ -64,8 +64,8 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-LABEL: urem32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: s_sub_i32 s3, 0, s2
Expand Down Expand Up @@ -121,6 +121,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-LABEL: sdiv32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s2, s3, 31
Expand Down Expand Up @@ -180,6 +181,7 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-LABEL: srem32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s3, s2, 31
Expand Down
1,394 changes: 697 additions & 697 deletions llvm/test/CodeGen/AMDGPU/idot2.ll

Large diffs are not rendered by default.

434 changes: 217 additions & 217 deletions llvm/test/CodeGen/AMDGPU/idot4s.ll

Large diffs are not rendered by default.

1,012 changes: 506 additions & 506 deletions llvm/test/CodeGen/AMDGPU/idot4u.ll

Large diffs are not rendered by default.

1,475 changes: 1,099 additions & 376 deletions llvm/test/CodeGen/AMDGPU/idot8s.ll

Large diffs are not rendered by default.

1,492 changes: 748 additions & 744 deletions llvm/test/CodeGen/AMDGPU/idot8u.ll

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/immv216.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
; FIXME: Merge into imm.ll

; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/indirect-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr() {
; GCN-NEXT: is_ptr64 = 1
; GCN-NEXT: is_dynamic_callstack = 1
; GCN-NEXT: is_debug_enabled = 0
; GCN-NEXT: is_xnack_enabled = 1
; GCN-NEXT: is_xnack_enabled = 0
; GCN-NEXT: workitem_private_segment_byte_size = 16384
; GCN-NEXT: workgroup_group_segment_byte_size = 0
; GCN-NEXT: gds_segment_byte_size = 0
Expand Down Expand Up @@ -149,7 +149,7 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
; GCN-NEXT: is_ptr64 = 1
; GCN-NEXT: is_dynamic_callstack = 1
; GCN-NEXT: is_debug_enabled = 0
; GCN-NEXT: is_xnack_enabled = 1
; GCN-NEXT: is_xnack_enabled = 0
; GCN-NEXT: workitem_private_segment_byte_size = 16384
; GCN-NEXT: workgroup_group_segment_byte_size = 0
; GCN-NEXT: gds_segment_byte_size = 0
Expand Down
88 changes: 44 additions & 44 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
; GFX9-LABEL: s_insertelement_v2i16_0_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -91,13 +91,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
Expand Down Expand Up @@ -157,12 +157,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2
; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -210,11 +210,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_lshr_b32 s3, s4, 16
; GFX9-NEXT: s_lshr_b32 s3, s6, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
Expand Down Expand Up @@ -275,11 +275,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_lshr_b32 s3, s4, 16
; GFX9-NEXT: s_lshr_b32 s3, s6, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2
Expand Down Expand Up @@ -387,12 +387,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %
; GFX9-LABEL: s_insertelement_v2i16_1_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -562,12 +562,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4
; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
Expand Down Expand Up @@ -1039,17 +1039,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; GFX9-LABEL: s_insertelement_v2i16_dynamic:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s3, s4, 4
; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s3
; GFX9-NEXT: s_andn2_b32 s2, s2, s3
; GFX9-NEXT: s_and_b32 s3, s3, 0x3e703e7
; GFX9-NEXT: s_or_b32 s2, s3, s2
; GFX9-NEXT: s_lshl_b32 s2, s4, 4
; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2
; GFX9-NEXT: s_andn2_b32 s3, s5, s2
; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7
; GFX9-NEXT: s_or_b32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -1102,12 +1102,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_lshl_b32 s2, s4, 4
; GFX9-NEXT: s_lshl_b32 s2, s6, 4
; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
Expand Down Expand Up @@ -1169,10 +1169,10 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
Expand Down Expand Up @@ -1252,13 +1252,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
; GFX9-LABEL: v_insertelement_v4f16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v0, v3, s4, v0
; GFX9-NEXT: v_bfi_b32 v0, v3, s6, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1317,13 +1317,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
; GFX9-LABEL: v_insertelement_v4f16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1382,13 +1382,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
; GFX9-LABEL: v_insertelement_v4f16_2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1
; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1447,13 +1447,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
; GFX9-LABEL: v_insertelement_v4f16_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1512,13 +1512,13 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,
; GFX9-LABEL: v_insertelement_v4i16_2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1
; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
Expand Down Expand Up @@ -1580,15 +1580,15 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; GFX9-NEXT: global_load_dword v2, v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1
; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0
Expand Down Expand Up @@ -1667,17 +1667,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_lshl_b32 s5, s5, 4
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s5
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_lshl_b32 s4, s7, 4
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1
; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0
Expand Down
156 changes: 78 additions & 78 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s4, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
Expand All @@ -31,11 +31,11 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, v1
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
Expand All @@ -46,13 +46,13 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s2, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s4, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
Expand All @@ -69,10 +69,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)
; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s0, s0
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s4, s4
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
Expand All @@ -92,22 +92,22 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
Expand Down Expand Up @@ -136,12 +136,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand Down Expand Up @@ -269,22 +269,22 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
Expand Down Expand Up @@ -313,12 +313,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand All @@ -339,22 +339,22 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
Expand Down Expand Up @@ -383,12 +383,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand All @@ -409,22 +409,22 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
Expand Down Expand Up @@ -453,12 +453,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand All @@ -480,22 +480,22 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[10:11]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[10:11], s[2:3]
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
Expand Down Expand Up @@ -524,12 +524,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
Expand Down
20 changes: 11 additions & 9 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ main_body:
}

; GCN-LABEL: {{^}}sample_contig_nsa:
; GCN: image_sample_c_l v0, v[0:7],
; NSA: image_sample v1, [v6, v7, v5],
; NONSA: image_sample_c_l v5, v[0:7],
; NSA: image_sample_c_l v8, v[0:7],
; NSA: image_sample v9, [v6, v7, v5],
define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand All @@ -44,8 +45,8 @@ main_body:
}

; GCN-LABEL: {{^}}sample_nsa_nsa:
; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
; NSA: image_sample v1, [v6, v7, v5],
; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0],
; NSA: image_sample v9, [v6, v7, v5],
define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand All @@ -56,8 +57,8 @@ main_body:
}

; GCN-LABEL: {{^}}sample_nsa_contig:
; NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
; NSA: image_sample v1, v[5:7],
; NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0],
; NSA: image_sample v9, v[5:7],
define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand All @@ -68,9 +69,10 @@ main_body:
}

; GCN-LABEL: {{^}}sample_contig_contig:
; GCN: image_sample_c_l v0, v[0:7],
; NSA: image_sample v1, v[5:7],
; NONSA: image_sample v1, v[5:7],
; NSA: image_sample_c_l v8, v[0:7],
; NSA: image_sample v9, v[5:7],
; NONSA: image_sample_c_l v8, v[0:7],
; NONSA: image_sample v9, v[5:7],
define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) {
main_body:
%v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT0-LABEL: test_barrier:
; VARIANT0: ; %bb.0: ; %entry
; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb
; VARIANT0-NEXT: s_load_dword s0, s[0:1], 0xb
; VARIANT0-NEXT: s_mov_b32 s7, 0xf000
; VARIANT0-NEXT: s_mov_b32 s6, 0
; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0
Expand All @@ -18,7 +18,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; VARIANT0-NEXT: s_barrier
; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s0, v3
; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
Expand All @@ -29,7 +29,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT1-LABEL: test_barrier:
; VARIANT1: ; %bb.0: ; %entry
; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb
; VARIANT1-NEXT: s_load_dword s0, s[0:1], 0xb
; VARIANT1-NEXT: s_mov_b32 s7, 0xf000
; VARIANT1-NEXT: s_mov_b32 s6, 0
; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0
Expand All @@ -38,7 +38,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT1-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT1-NEXT: s_barrier
; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s0, v3
; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT1-NEXT: s_waitcnt expcnt(0)
Expand All @@ -50,11 +50,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT2-LABEL: test_barrier:
; VARIANT2: ; %bb.0: ; %entry
; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c
; VARIANT2-NEXT: s_load_dword s4, s[0:1], 0x2c
; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VARIANT2-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3]
; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s0
; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s4
; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VARIANT2-NEXT: v_mov_b32_e32 v3, s3
Expand All @@ -70,11 +70,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT3-LABEL: test_barrier:
; VARIANT3: ; %bb.0: ; %entry
; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c
; VARIANT3-NEXT: s_load_dword s4, s[0:1], 0x2c
; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VARIANT3-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3]
; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s0
; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s4
; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VARIANT3-NEXT: v_mov_b32_e32 v3, s3
Expand Down
124 changes: 62 additions & 62 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
; SI-LABEL: bfe_u32_arg_arg_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_bfe_u32 v0, v0, s3, s3
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_bfe_u32 v0, v0, s5, s5
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_arg_arg_arg:
Expand All @@ -34,15 +34,15 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0
define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_arg_imm:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x7b
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_bfe_u32 v0, s2, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_bfe_u32 v0, s4, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_arg_arg_imm:
Expand All @@ -65,15 +65,15 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0
define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
; SI-LABEL: bfe_u32_arg_imm_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x7b
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_bfe_u32 v0, s2, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_bfe_u32 v0, s4, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_arg_imm_arg:
Expand All @@ -96,16 +96,16 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0
define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
; SI-LABEL: bfe_u32_imm_arg_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_movk_i32 s0, 0x7b
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_movk_i32 s6, 0x7b
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_bfe_u32 v0, s0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_bfe_u32 v0, s6, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bfe_u32_imm_arg_arg:
Expand Down Expand Up @@ -1590,13 +1590,13 @@ define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 {
; SI-LABEL: lshr_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s0, s2, 0x30006
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_bfe_u32 s4, s2, 0x30006
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: lshr_and:
Expand All @@ -1619,15 +1619,15 @@ define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 {
define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: v_lshr_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s0, s2, s3
; SI-NEXT: s_and_b32 s0, s0, 7
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_lshr_b32 s2, s4, s5
; SI-NEXT: s_and_b32 s4, s2, 7
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_and:
Expand All @@ -1652,13 +1652,13 @@ define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
; SI-LABEL: and_lshr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s0, s2, 0x30006
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_bfe_u32 s4, s2, 0x30006
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: and_lshr:
Expand All @@ -1682,13 +1682,13 @@ define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
; SI-LABEL: and_lshr2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s0, s2, 0x30006
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_bfe_u32 s4, s2, 0x30006
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: and_lshr2:
Expand All @@ -1712,13 +1712,13 @@ define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
; SI-LABEL: shl_lshr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s0, s2, 0x150002
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_bfe_u32 s4, s2, 0x150002
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: shl_lshr:
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -292,17 +292,17 @@ define amdgpu_kernel void @maxnum_v2f16(
; GFX9-LABEL: maxnum_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, s6, s6
; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_max_f16 v1, s10, s10
; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -529,12 +529,12 @@ define amdgpu_kernel void @maxnum_v3f16(
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
; GFX9-NEXT: v_pk_max_f16 v0, s6, s6
; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
; GFX9-NEXT: v_pk_max_f16 v2, s7, s7
; GFX9-NEXT: v_pk_max_f16 v2, s11, s11
; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
Expand Down Expand Up @@ -643,12 +643,12 @@ define amdgpu_kernel void @maxnum_v4f16(
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
; GFX9-NEXT: v_pk_max_f16 v1, v1, v0
; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_max_f16 v2, s10, s10
; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
; GFX9-NEXT: v_pk_max_f16 v0, v0, v2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -315,17 +315,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; GFX9-LABEL: minnum_v2f16_ieee:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, s6, s6
; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_max_f16 v1, s10, s10
; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -582,12 +582,12 @@ define amdgpu_kernel void @minnum_v3f16(
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
; GFX9-NEXT: v_pk_max_f16 v0, s6, s6
; GFX9-NEXT: v_pk_max_f16 v0, s10, s10
; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
; GFX9-NEXT: v_pk_max_f16 v2, s7, s7
; GFX9-NEXT: v_pk_max_f16 v2, s11, s11
; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
; GFX9-NEXT: v_pk_min_f16 v1, v1, v2
; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
Expand Down Expand Up @@ -696,12 +696,12 @@ define amdgpu_kernel void @minnum_v4f16(
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, s5, s5
; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
; GFX9-NEXT: v_pk_min_f16 v1, v1, v0
; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_max_f16 v2, s10, s10
; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
Expand Down
Loading