110 changes: 50 additions & 60 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
Original file line number Diff line number Diff line change
Expand Up @@ -452,25 +452,24 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_i8_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_i8_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s3
Expand Down Expand Up @@ -527,18 +526,19 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 a
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; SI-NEXT: s_waitcnt vmcnt(2)
Expand All @@ -556,15 +556,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -643,28 +641,27 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_i32_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand All @@ -691,27 +688,26 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_hi1_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -739,25 +735,24 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: i8_zext_i32_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s3
Expand All @@ -782,18 +777,19 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out,
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; SI-NEXT: s_waitcnt vmcnt(2)
Expand All @@ -811,15 +807,13 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -865,26 +859,25 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte0_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte0_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand All @@ -909,27 +902,26 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out
define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte1_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v0, v0, 8, 8
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte1_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand All @@ -955,27 +947,26 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out
define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte2_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v0, v0, 16, 8
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte2_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand All @@ -1001,26 +992,25 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out
define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: extract_byte3_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: extract_byte3_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,10 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(<2 x half>* %ptr, <2 x half> %data
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) {
; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24
; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NEXT: ds_pk_add_f16 v0, v1
; GFX940-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,19 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
; GFX8V3-LABEL: addrspacecast:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V3-NEXT: s_load_dword s3, s[4:5], 0x44
; GFX8V3-NEXT: s_load_dword s5, s[4:5], 0x40
; GFX8V3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40
; GFX8V3-NEXT: v_mov_b32_e32 v2, 1
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V3-NEXT: s_mov_b32 s2, s0
; GFX8V3-NEXT: s_mov_b32 s4, s0
; GFX8V3-NEXT: s_mov_b32 s5, s3
; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX8V3-NEXT: s_mov_b32 s4, s1
; GFX8V3-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX8V3-NEXT: s_mov_b32 s6, s1
; GFX8V3-NEXT: s_mov_b32 s7, s2
; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V3-NEXT: v_mov_b32_e32 v0, s2
; GFX8V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX8V3-NEXT: v_mov_b32_e32 v1, s3
; GFX8V3-NEXT: v_mov_b32_e32 v0, s4
; GFX8V3-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
; GFX8V3-NEXT: v_mov_b32_e32 v1, s5
; GFX8V3-NEXT: flat_store_dword v[0:1], v2
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -35,18 +36,19 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
; GFX8V4-LABEL: addrspacecast:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V4-NEXT: s_load_dword s3, s[4:5], 0x44
; GFX8V4-NEXT: s_load_dword s5, s[4:5], 0x40
; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40
; GFX8V4-NEXT: v_mov_b32_e32 v2, 1
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_mov_b32 s2, s0
; GFX8V4-NEXT: s_mov_b32 s4, s0
; GFX8V4-NEXT: s_mov_b32 s5, s3
; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX8V4-NEXT: s_mov_b32 s4, s1
; GFX8V4-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX8V4-NEXT: s_mov_b32 s6, s1
; GFX8V4-NEXT: s_mov_b32 s7, s2
; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V4-NEXT: v_mov_b32_e32 v0, s2
; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s3
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_store_dword v[0:1], v2
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -59,18 +61,18 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
; GFX8V5-LABEL: addrspacecast:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8V5-NEXT: s_load_dword s3, s[4:5], 0xc8
; GFX8V5-NEXT: s_load_dword s5, s[4:5], 0xcc
; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8
; GFX8V5-NEXT: v_mov_b32_e32 v2, 1
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_mov_b32 s2, s0
; GFX8V5-NEXT: s_mov_b32 s4, s0
; GFX8V5-NEXT: s_mov_b32 s5, s2
; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX8V5-NEXT: s_mov_b32 s4, s1
; GFX8V5-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
; GFX8V5-NEXT: s_mov_b32 s2, s1
; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V5-NEXT: v_mov_b32_e32 v0, s2
; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s3
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_store_dword v[0:1], v2
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
Expand Down
168 changes: 78 additions & 90 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
Original file line number Diff line number Diff line change
Expand Up @@ -635,9 +635,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000
; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0
Expand All @@ -646,10 +644,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000
; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: v_mov_b32_e32 v2, s6
; GFX1030-NEXT: v_mov_b32_e32 v3, s7
; GFX1030-NEXT: v_mov_b32_e32 v0, s0
; GFX1030-NEXT: v_mov_b32_e32 v1, s1
; GFX1030-NEXT: v_mov_b32_e32 v2, s2
; GFX1030-NEXT: v_mov_b32_e32 v3, s3
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
Expand All @@ -660,26 +658,24 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000
; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000
; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
; GFX1013-NEXT: v_mov_b32_e32 v0, s0
; GFX1013-NEXT: v_mov_b32_e32 v1, s1
; GFX1013-NEXT: v_mov_b32_e32 v2, s2
; GFX1013-NEXT: v_mov_b32_e32 v3, s3
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
Expand All @@ -692,18 +688,15 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[8:11]
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX11-NEXT: s_mov_b32 s8, 2.0
; GFX11-NEXT: s_mov_b32 s9, 0x40400000
; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
; GFX11-NEXT: s_mov_b32 s11, 0x40a00000
Expand All @@ -713,23 +706,24 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
; GFX11-NEXT: v_mov_b32_e32 v6, s12
; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX11-NEXT: s_mov_b32 s6, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_mov_b32 s1, 1.0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT: flat_load_b32 v9, v[0:1]
; GFX11-NEXT: flat_load_b32 v10, v[2:3]
; GFX11-NEXT: s_mov_b32 s7, 1.0
; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s9
; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8
; GFX11-NEXT: s_mov_b32 s2, 2.0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s9
; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v4, s10
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[0:3]
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -756,126 +750,120 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr
define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_clause 0x1
; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX1030-NEXT: s_movk_i32 s9, 0x4600
; GFX1030-NEXT: s_movk_i32 s8, 0x4700
; GFX1030-NEXT: s_bfe_u32 s8, s8, 0x100000
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: v_mov_b32_e32 v2, s6
; GFX1030-NEXT: v_mov_b32_e32 v3, s7
; GFX1030-NEXT: s_movk_i32 s5, 0x4400
; GFX1030-NEXT: v_mov_b32_e32 v0, s0
; GFX1030-NEXT: v_mov_b32_e32 v1, s1
; GFX1030-NEXT: v_mov_b32_e32 v2, s2
; GFX1030-NEXT: v_mov_b32_e32 v3, s3
; GFX1030-NEXT: s_movk_i32 s1, 0x4400
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT: s_movk_i32 s6, 0x4200
; GFX1030-NEXT: s_movk_i32 s2, 0x4200
; GFX1030-NEXT: flat_load_dword v0, v[0:1]
; GFX1030-NEXT: flat_load_dword v1, v[2:3]
; GFX1030-NEXT: s_bfe_u32 s5, s5, 0x100000
; GFX1030-NEXT: s_movk_i32 s7, 0x4800
; GFX1030-NEXT: s_bfe_u32 s6, s6, 0x100000
; GFX1030-NEXT: s_lshl_b32 s5, s5, 16
; GFX1030-NEXT: s_movk_i32 s4, 0x4500
; GFX1030-NEXT: s_or_b32 s5, s6, s5
; GFX1030-NEXT: s_bfe_u32 s6, s9, 0x100000
; GFX1030-NEXT: s_bfe_u32 s7, s7, 0x100000
; GFX1030-NEXT: s_bfe_u32 s4, s4, 0x100000
; GFX1030-NEXT: s_lshl_b32 s6, s6, 16
; GFX1030-NEXT: s_lshl_b32 s7, s7, 16
; GFX1030-NEXT: s_or_b32 s4, s4, s6
; GFX1030-NEXT: s_or_b32 s6, s8, s7
; GFX1030-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX1030-NEXT: s_movk_i32 s3, 0x4800
; GFX1030-NEXT: s_bfe_u32 s2, s2, 0x100000
; GFX1030-NEXT: s_lshl_b32 s1, s1, 16
; GFX1030-NEXT: s_movk_i32 s0, 0x4500
; GFX1030-NEXT: s_or_b32 s1, s2, s1
; GFX1030-NEXT: s_bfe_u32 s2, s9, 0x100000
; GFX1030-NEXT: s_bfe_u32 s3, s3, 0x100000
; GFX1030-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX1030-NEXT: s_lshl_b32 s2, s2, 16
; GFX1030-NEXT: s_lshl_b32 s3, s3, 16
; GFX1030-NEXT: s_or_b32 s0, s0, s2
; GFX1030-NEXT: s_or_b32 s2, s8, s3
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1030-NEXT: v_mov_b32_e32 v5, s5
; GFX1030-NEXT: v_mov_b32_e32 v6, s4
; GFX1030-NEXT: v_mov_b32_e32 v7, s6
; GFX1030-NEXT: v_mov_b32_e32 v5, s1
; GFX1030-NEXT: v_mov_b32_e32 v6, s0
; GFX1030-NEXT: v_mov_b32_e32 v7, s2
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_clause 0x1
; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GFX1013-NEXT: s_movk_i32 s9, 0x4600
; GFX1013-NEXT: s_movk_i32 s8, 0x4700
; GFX1013-NEXT: s_bfe_u32 s8, s8, 0x100000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, s0
; GFX1013-NEXT: v_mov_b32_e32 v1, s1
; GFX1013-NEXT: v_mov_b32_e32 v2, s2
; GFX1013-NEXT: v_mov_b32_e32 v3, s3
; GFX1013-NEXT: s_movk_i32 s1, 0x4400
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1013-NEXT: s_movk_i32 s2, 0x4200
; GFX1013-NEXT: flat_load_dword v0, v[4:5]
; GFX1013-NEXT: flat_load_dword v1, v[2:3]
; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX1013-NEXT: s_movk_i32 s3, 0x4800
; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000
; GFX1013-NEXT: s_lshl_b32 s1, s1, 16
; GFX1013-NEXT: s_movk_i32 s0, 0x4500
; GFX1013-NEXT: s_or_b32 s1, s2, s1
; GFX1013-NEXT: s_bfe_u32 s2, s9, 0x100000
; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000
; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX1013-NEXT: s_lshl_b32 s3, s3, 16
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
; GFX1013-NEXT: v_mov_b32_e32 v0, s4
; GFX1013-NEXT: v_mov_b32_e32 v1, s5
; GFX1013-NEXT: v_mov_b32_e32 v2, s6
; GFX1013-NEXT: v_mov_b32_e32 v3, s7
; GFX1013-NEXT: s_movk_i32 s5, 0x4600
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1013-NEXT: s_movk_i32 s4, 0x4700
; GFX1013-NEXT: flat_load_dword v0, v[4:5]
; GFX1013-NEXT: flat_load_dword v1, v[2:3]
; GFX1013-NEXT: s_bfe_u32 s2, s5, 0x100000
; GFX1013-NEXT: s_bfe_u32 s4, s4, 0x100000
; GFX1013-NEXT: s_lshl_b32 s2, s2, 16
; GFX1013-NEXT: v_mov_b32_e32 v2, 0
; GFX1013-NEXT: s_lshl_b32 s3, s3, 16
; GFX1013-NEXT: s_or_b32 s0, s0, s2
; GFX1013-NEXT: s_or_b32 s2, s4, s3
; GFX1013-NEXT: s_or_b32 s2, s8, s3
; GFX1013-NEXT: v_mov_b32_e32 v2, 0
; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT: v_mov_b32_e32 v5, s1
; GFX1013-NEXT: v_mov_b32_e32 v6, s0
; GFX1013-NEXT: v_mov_b32_e32 v7, s2
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX11-NEXT: s_mov_b32 s8, 2.0
; GFX11-NEXT: s_mov_b32 s9, 0x42004600
; GFX11-NEXT: s_mov_b32 s10, 0x44004700
; GFX11-NEXT: s_mov_b32 s11, 0x45004800
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX11-NEXT: s_mov_b32 s6, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_mov_b32 s1, 1.0
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT: s_mov_b32 s7, 1.0
; GFX11-NEXT: flat_load_b32 v6, v[0:1]
; GFX11-NEXT: flat_load_b32 v7, v[2:3]
; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s9
; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8
; GFX11-NEXT: s_mov_b32 s2, 2.0
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s9
; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v4, s10
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[0:3] a16
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
145 changes: 57 additions & 88 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll

Large diffs are not rendered by default.

152 changes: 60 additions & 92 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll

Large diffs are not rendered by default.

13 changes: 4 additions & 9 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
; GFX8-LABEL: dpp_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s3
Expand All @@ -20,22 +19,18 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: dpp_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand Down
213 changes: 104 additions & 109 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -622,57 +622,56 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32> addrspace(1)* %out1, <2 x i32> %x, <2 x i32> %y) {
; GFX8-LABEL: sdivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s2, s10, 31
; GFX8-NEXT: s_add_i32 s0, s10, s2
; GFX8-NEXT: s_xor_b32 s3, s0, s2
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX8-NEXT: s_sub_i32 s1, 0, s3
; GFX8-NEXT: s_ashr_i32 s12, s11, 31
; GFX8-NEXT: s_add_i32 s0, s11, s12
; GFX8-NEXT: s_ashr_i32 s10, s11, 31
; GFX8-NEXT: s_add_i32 s0, s11, s10
; GFX8-NEXT: s_xor_b32 s11, s0, s10
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_xor_b32 s11, s0, s12
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11
; GFX8-NEXT: s_ashr_i32 s10, s8, 31
; GFX8-NEXT: s_sub_i32 s0, 0, s3
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX8-NEXT: s_ashr_i32 s12, s8, 31
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: s_add_i32 s0, s8, s10
; GFX8-NEXT: s_xor_b32 s0, s0, s10
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0
; GFX8-NEXT: s_sub_i32 s8, 0, s11
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: s_sub_i32 s1, 0, s11
; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0
; GFX8-NEXT: s_add_i32 s0, s8, s12
; GFX8-NEXT: s_xor_b32 s0, s0, s12
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX8-NEXT: s_xor_b32 s0, s10, s2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_mul_lo_u32 v2, s1, v1
; GFX8-NEXT: v_mul_lo_u32 v3, v0, s3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0
; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3
; GFX8-NEXT: s_xor_b32 s0, s12, s2
; GFX8-NEXT: s_ashr_i32 s2, s9, 31
; GFX8-NEXT: s_add_i32 s1, s9, s2
; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: s_xor_b32 s1, s1, s2
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_xor_b32_e32 v2, s10, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1
; GFX8-NEXT: v_xor_b32_e32 v2, s12, v3
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v2
; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s12, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
Expand All @@ -683,7 +682,7 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3
; GFX8-NEXT: s_xor_b32 s0, s2, s12
; GFX8-NEXT: s_xor_b32 s0, s2, s10
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1
; GFX8-NEXT: v_mov_b32_e32 v4, s4
Expand All @@ -699,150 +698,146 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
;
; GFX9-LABEL: sdivrem_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s6, s10, 31
; GFX9-NEXT: s_add_i32 s0, s10, s6
; GFX9-NEXT: s_xor_b32 s7, s0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_i32 s5, s11, s4
; GFX9-NEXT: s_ashr_i32 s8, s6, 31
; GFX9-NEXT: s_add_i32 s6, s6, s8
; GFX9-NEXT: s_xor_b32 s6, s6, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_ashr_i32 s9, s7, 31
; GFX9-NEXT: s_add_i32 s7, s7, s9
; GFX9-NEXT: s_xor_b32 s7, s7, s9
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_xor_b32 s5, s5, s4
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5
; GFX9-NEXT: s_sub_i32 s11, 0, s7
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_sub_i32 s12, 0, s6
; GFX9-NEXT: s_ashr_i32 s10, s4, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: s_ashr_i32 s10, s8, 31
; GFX9-NEXT: s_add_i32 s8, s8, s10
; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX9-NEXT: s_add_i32 s4, s4, s10
; GFX9-NEXT: s_xor_b32 s4, s4, s10
; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: s_xor_b32 s8, s8, s10
; GFX9-NEXT: s_sub_i32 s12, 0, s7
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX9-NEXT: s_sub_i32 s11, 0, s5
; GFX9-NEXT: v_mul_lo_u32 v3, s11, v1
; GFX9-NEXT: s_ashr_i32 s11, s9, 31
; GFX9-NEXT: s_ashr_i32 s11, s5, 31
; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
; GFX9-NEXT: s_add_i32 s5, s5, s11
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3
; GFX9-NEXT: s_add_i32 s9, s9, s11
; GFX9-NEXT: s_xor_b32 s9, s9, s11
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7
; GFX9-NEXT: s_xor_b32 s5, s5, s11
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1
; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3
; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2
; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: s_xor_b32 s6, s10, s6
; GFX9-NEXT: s_xor_b32 s4, s11, s4
; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX9-NEXT: s_xor_b32 s4, s10, s8
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3
; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
; GFX9-NEXT: s_xor_b32 s4, s11, s9
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1
; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0
; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2
; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1
; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2
; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ashr_i32 s8, s2, 31
; GFX10-NEXT: s_ashr_i32 s9, s3, 31
; GFX10-NEXT: s_add_i32 s2, s2, s8
; GFX10-NEXT: s_add_i32 s3, s3, s9
; GFX10-NEXT: s_xor_b32 s2, s2, s8
; GFX10-NEXT: s_xor_b32 s3, s3, s9
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX10-NEXT: s_ashr_i32 s1, s10, 31
; GFX10-NEXT: s_ashr_i32 s2, s11, 31
; GFX10-NEXT: s_add_i32 s0, s10, s1
; GFX10-NEXT: s_add_i32 s3, s11, s2
; GFX10-NEXT: s_xor_b32 s10, s0, s1
; GFX10-NEXT: s_xor_b32 s3, s3, s2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX10-NEXT: s_sub_i32 s6, 0, s2
; GFX10-NEXT: s_sub_i32 s7, 0, s3
; GFX10-NEXT: s_ashr_i32 s10, s0, 31
; GFX10-NEXT: s_sub_i32 s0, 0, s10
; GFX10-NEXT: s_sub_i32 s11, 0, s3
; GFX10-NEXT: s_ashr_i32 s12, s9, 31
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX10-NEXT: s_ashr_i32 s11, s1, 31
; GFX10-NEXT: s_add_i32 s0, s0, s10
; GFX10-NEXT: s_add_i32 s1, s1, s11
; GFX10-NEXT: s_xor_b32 s0, s0, s10
; GFX10-NEXT: s_xor_b32 s1, s1, s11
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1
; GFX10-NEXT: s_ashr_i32 s11, s8, 31
; GFX10-NEXT: s_add_i32 s0, s8, s11
; GFX10-NEXT: s_add_i32 s8, s9, s12
; GFX10-NEXT: s_xor_b32 s0, s0, s11
; GFX10-NEXT: s_xor_b32 s8, s8, s12
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX10-NEXT: s_xor_b32 s1, s11, s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2
; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s1, v3
; GFX10-NEXT: s_xor_b32 s1, s10, s8
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: s_xor_b32 s0, s11, s9
; GFX10-NEXT: s_xor_b32 s0, s12, s2
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
; GFX10-NEXT: v_xor_b32_e32 v2, s10, v2
; GFX10-NEXT: v_xor_b32_e32 v3, s11, v3
; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2
; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s11, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX10-NEXT: s_endpgm
Expand Down
91 changes: 43 additions & 48 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -528,8 +528,7 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32> addrspace(1)* %out1, <2 x i32> %x, <2 x i32> %y) {
; GFX8-LABEL: udivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11
Expand Down Expand Up @@ -583,106 +582,102 @@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32
;
; GFX9-LABEL: udivrem_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_sub_i32 s6, 0, s2
; GFX9-NEXT: s_sub_i32 s7, 0, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX9-NEXT: s_sub_i32 s0, 0, s10
; GFX9-NEXT: s_sub_i32 s1, 0, s11
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_mul_hi_u32 v1, s1, v1
; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2
; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX9-NEXT: v_mul_lo_u32 v2, v0, s10
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11
; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
; GFX9-NEXT: v_sub_u32_e32 v2, s0, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
; GFX9-NEXT: v_sub_u32_e32 v3, s1, v3
; GFX9-NEXT: v_sub_u32_e32 v2, s8, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v2
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3
; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v2
; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2
; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX10-NEXT: s_sub_i32 s6, 0, s2
; GFX10-NEXT: s_sub_i32 s7, 0, s3
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX10-NEXT: s_sub_i32 s0, 0, s10
; GFX10-NEXT: s_sub_i32 s1, 0, s11
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s1, v1
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3
; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s11
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s1, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5]
; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7]
; GFX10-NEXT: s_endpgm
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,14 +274,13 @@ define void @addrspacecast_requires_queue_ptr(i32 addrspace(5)* %ptr.private, i3
; FIXEDABI-SDAG-LABEL: addrspacecast_requires_queue_ptr:
; FIXEDABI-SDAG: ; %bb.0:
; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FIXEDABI-SDAG-NEXT: s_load_dword s4, s[6:7], 0x44
; FIXEDABI-SDAG-NEXT: s_load_dword s5, s[6:7], 0x40
; FIXEDABI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x40
; FIXEDABI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
; FIXEDABI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s4
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s5
; FIXEDABI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
; FIXEDABI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, s5
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, s4
; FIXEDABI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1
; FIXEDABI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc
; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, 1
Expand All @@ -296,14 +295,13 @@ define void @addrspacecast_requires_queue_ptr(i32 addrspace(5)* %ptr.private, i3
; FIXEDABI-GISEL-LABEL: addrspacecast_requires_queue_ptr:
; FIXEDABI-GISEL: ; %bb.0:
; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FIXEDABI-GISEL-NEXT: s_load_dword s4, s[6:7], 0x44
; FIXEDABI-GISEL-NEXT: s_load_dword s5, s[6:7], 0x40
; FIXEDABI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x40
; FIXEDABI-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0
; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
; FIXEDABI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s4
; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v4, s5
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v0, s5
; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc
; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v4, s4
; FIXEDABI-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1
; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
Expand Down
134 changes: 66 additions & 68 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -522,50 +522,49 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-LABEL: introduced_copy_to_sgpr:
; GFX908: ; %bb.0: ; %bb
; GFX908-NEXT: global_load_ushort v24, v[0:1], off glc
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
; GFX908-NEXT: s_load_dword s6, s[4:5], 0x18
; GFX908-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x10
; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_load_dword s5, s[4:5], 0x18
; GFX908-NEXT: s_mov_b32 s4, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1
; GFX908-NEXT: s_sub_i32 s7, 0, s1
; GFX908-NEXT: s_lshr_b32 s5, s6, 16
; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s6
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX908-NEXT: s_sub_i32 s6, 0, s3
; GFX908-NEXT: s_lshl_b64 s[8:9], s[10:11], 5
; GFX908-NEXT: s_lshr_b32 s12, s5, 16
; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5
; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s5
; GFX908-NEXT: s_or_b32 s10, s10, 28
; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s5
; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s12
; GFX908-NEXT: s_or_b32 s8, s8, 28
; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX908-NEXT: v_mov_b32_e32 v7, s3
; GFX908-NEXT: s_mov_b32 s4, 0
; GFX908-NEXT: v_mov_b32_e32 v6, s2
; GFX908-NEXT: v_mul_lo_u32 v2, s7, v0
; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], 5
; GFX908-NEXT: v_mov_b32_e32 v6, s10
; GFX908-NEXT: v_mov_b32_e32 v7, s11
; GFX908-NEXT: v_mul_lo_u32 v2, s6, v0
; GFX908-NEXT: s_lshl_b64 s[6:7], s[0:1], 5
; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX908-NEXT: v_add_u32_e32 v0, v0, v2
; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX908-NEXT: v_mov_b32_e32 v2, s10
; GFX908-NEXT: v_mov_b32_e32 v3, s11
; GFX908-NEXT: v_mul_lo_u32 v4, v0, s1
; GFX908-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX908-NEXT: v_mov_b32_e32 v2, s8
; GFX908-NEXT: v_mov_b32_e32 v3, s9
; GFX908-NEXT: v_mul_lo_u32 v4, v0, s3
; GFX908-NEXT: v_add_u32_e32 v5, 1, v0
; GFX908-NEXT: v_sub_u32_e32 v4, s0, v4
; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v4
; GFX908-NEXT: v_sub_u32_e32 v4, s2, v4
; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s3, v4
; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX908-NEXT: v_subrev_u32_e32 v5, s1, v4
; GFX908-NEXT: v_subrev_u32_e32 v5, s3, v4
; GFX908-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; GFX908-NEXT: v_add_u32_e32 v5, 1, v0
; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v4
; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s3, v4
; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX908-NEXT: v_lshlrev_b64 v[4:5], 5, v[0:1]
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s0, v24
; GFX908-NEXT: s_and_b32 s0, 0xffff, s0
; GFX908-NEXT: s_mul_i32 s1, s9, s0
; GFX908-NEXT: s_mul_hi_u32 s5, s8, s0
; GFX908-NEXT: s_mul_i32 s0, s8, s0
; GFX908-NEXT: s_add_i32 s1, s5, s1
; GFX908-NEXT: v_readfirstlane_b32 s2, v24
; GFX908-NEXT: s_and_b32 s2, 0xffff, s2
; GFX908-NEXT: s_mul_i32 s1, s1, s2
; GFX908-NEXT: s_mul_hi_u32 s3, s0, s2
; GFX908-NEXT: s_mul_i32 s0, s0, s2
; GFX908-NEXT: s_add_i32 s1, s3, s1
; GFX908-NEXT: s_lshl_b64 s[8:9], s[0:1], 5
; GFX908-NEXT: s_branch .LBB3_2
; GFX908-NEXT: .LBB3_1: ; %bb12
Expand Down Expand Up @@ -662,50 +661,49 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-LABEL: introduced_copy_to_sgpr:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: global_load_ushort v28, v[0:1], off glc
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10
; GFX90A-NEXT: s_load_dword s3, s[4:5], 0x18
; GFX90A-NEXT: s_load_dword s7, s[4:5], 0x18
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_mov_b32 s6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX90A-NEXT: s_sub_i32 s12, 0, s7
; GFX90A-NEXT: s_lshr_b32 s13, s3, 16
; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s3
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX90A-NEXT: s_sub_i32 s12, 0, s3
; GFX90A-NEXT: s_lshr_b32 s13, s7, 16
; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s7
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s13
; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5
; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5
; GFX90A-NEXT: s_or_b32 s10, s10, 28
; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX90A-NEXT: s_mov_b32 s2, 0
; GFX90A-NEXT: s_or_b32 s10, s10, 28
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1]
; GFX90A-NEXT: v_mul_lo_u32 v8, s12, v0
; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8
; GFX90A-NEXT: v_add_u32_e32 v0, v0, v8
; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s7
; GFX90A-NEXT: v_sub_u32_e32 v8, s6, v8
; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s3
; GFX90A-NEXT: v_sub_u32_e32 v8, s2, v8
; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v8
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v8
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX90A-NEXT: v_subrev_u32_e32 v9, s7, v8
; GFX90A-NEXT: v_subrev_u32_e32 v9, s3, v8
; GFX90A-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v8
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v8
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX90A-NEXT: v_lshlrev_b64 v[8:9], 5, v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], 0, 0
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s3, v28
; GFX90A-NEXT: s_and_b32 s3, 0xffff, s3
; GFX90A-NEXT: s_mul_i32 s1, s1, s3
; GFX90A-NEXT: s_mul_hi_u32 s6, s0, s3
; GFX90A-NEXT: s_mul_i32 s0, s0, s3
; GFX90A-NEXT: s_add_i32 s1, s6, s1
; GFX90A-NEXT: s_lshl_b64 s[6:7], s[0:1], 5
; GFX90A-NEXT: v_readfirstlane_b32 s2, v28
; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2
; GFX90A-NEXT: s_mul_i32 s1, s1, s2
; GFX90A-NEXT: s_mul_hi_u32 s3, s0, s2
; GFX90A-NEXT: s_mul_i32 s0, s0, s2
; GFX90A-NEXT: s_add_i32 s1, s3, s1
; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5
; GFX90A-NEXT: s_branch .LBB3_2
; GFX90A-NEXT: .LBB3_1: ; %bb12
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
Expand All @@ -720,38 +718,38 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: ; %bb.3: ; %bb14
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off
; GFX90A-NEXT: s_mov_b32 s3, s2
; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_mov_b32 s7, s6
; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[4:5]
; GFX90A-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s3, v12
; GFX90A-NEXT: v_readfirstlane_b32 s7, v12
; GFX90A-NEXT: v_readfirstlane_b32 s8, v13
; GFX90A-NEXT: s_add_u32 s3, s3, 1
; GFX90A-NEXT: s_add_u32 s7, s7, 1
; GFX90A-NEXT: s_addc_u32 s9, s8, 0
; GFX90A-NEXT: s_mul_hi_u32 s10, s4, s3
; GFX90A-NEXT: s_mul_i32 s11, s5, s3
; GFX90A-NEXT: s_mul_i32 s8, s4, s3
; GFX90A-NEXT: s_mul_i32 s3, s4, s9
; GFX90A-NEXT: s_add_i32 s3, s10, s3
; GFX90A-NEXT: s_add_i32 s3, s3, s11
; GFX90A-NEXT: s_mul_hi_u32 s10, s4, s7
; GFX90A-NEXT: s_mul_i32 s11, s5, s7
; GFX90A-NEXT: s_mul_i32 s8, s4, s7
; GFX90A-NEXT: s_mul_i32 s7, s4, s9
; GFX90A-NEXT: s_add_i32 s7, s10, s7
; GFX90A-NEXT: s_add_i32 s7, s7, s11
; GFX90A-NEXT: s_branch .LBB3_5
; GFX90A-NEXT: .LBB3_4: ; %bb58
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
; GFX90A-NEXT: v_mov_b32_e32 v24, s7
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s6, v14
; GFX90A-NEXT: v_mov_b32_e32 v24, s3
; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s2, v14
; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v24, vcc
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: .LBB3_5: ; %bb16
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_mov_b32_e32 v25, s3
; GFX90A-NEXT: v_mov_b32_e32 v25, s7
; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s8, v14
; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v25, vcc
; GFX90A-NEXT: global_load_dword v30, v[24:25], off offset:-12 glc
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/always-uniform.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.readfirstlane(i32)

; GCN-LABEL: readfirstlane_uniform
; GCN: s_load_dwordx2 s[[[IN_ADDR:[0-9]+]]:1], s[4:5], 0x0
; GCN: s_load_dwordx4 s[[[IN_ADDR:[0-9]+]]:3], s[4:5], 0x0
; GCN: v_readfirstlane_b32 s[[SCALAR:[0-9]+]], v0
; GCN: s_add_u32 s[[LOAD_ADDR:[0-9]+]], s[[IN_ADDR]], s[[SCALAR]]
; GCN: s_load_dword s{{[0-9]+}}, s[[[LOAD_ADDR]]
Expand Down
779 changes: 386 additions & 393 deletions llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
; HSA-PROMOTE: .end_amd_kernel_code_t

; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2
; HSA-PROMOTE: s_load_dwordx2 s[{{[0-9:]+}}], s[4:5], 0x1

; SI-PROMOTE: ds_write_b32
; SI-PROMOTE: ds_write_b32
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32:
; GCN: s_load_dwordx4
; GCN-DAG: s_load_dwordx4
; GCN: s_load_dwordx8
; GCN-DAG: s_load_dword

; GCN: {{buffer|flat}}_store_byte
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
}

; GCN-LABEL: {{^}}s_ubfe_sub_i32:
; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]]
; GCN: s_load_dwordx4 s[[[#LOAD:]]:[[END:[0-9]+]]], s[0:1], {{0x9|0x24}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[#LOAD + 3]]
; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[#LOAD + 2]], [[SUB]]
; GCN: s_lshr_b32 s{{[0-9]+}}, [[TMP]], [[SUB]]
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand All @@ -60,9 +60,9 @@ define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32
}

; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32:
; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
; GCN: s_load_dwordx4 s[[[#LOAD:]]:[[END:[0-9]+]]], s[0:1], {{0x9|0x24}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[#LOAD + 3]]
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[#LOAD + 2]], [[SUB]]
; GCN: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand Down Expand Up @@ -119,9 +119,9 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
}

; GCN-LABEL: {{^}}s_sbfe_sub_i32:
; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]]
; GCN: s_load_dwordx4 s[[[#LOAD:]]:[[END:[0-9]+]]], s[0:1], {{0x9|0x24}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[#LOAD + 3]]
; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[#LOAD + 2]], [[SUB]]
; GCN: s_ashr_i32 s{{[0-9]+}}, [[TMP]], [[SUB]]
define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand All @@ -134,9 +134,9 @@ define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32
}

; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32:
; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
; GCN: s_load_dwordx4 s[[[#LOAD:]]:[[END:[0-9]+]]], s[0:1], {{0x9|0x24}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[#LOAD + 3]]
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[#LOAD + 2]], [[SUB]]
; GCN: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand Down
253 changes: 122 additions & 131 deletions llvm/test/CodeGen/AMDGPU/bfi_int.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,77 +11,74 @@
define amdgpu_kernel void @s_bfi_def_i32(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_def_i32:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xd
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_andn2_b32 s6, s6, s4
; GFX7-NEXT: s_and_b32 s4, s5, s4
; GFX7-NEXT: s_or_b32 s4, s6, s4
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: s_mov_b32 s1, s5
; GFX7-NEXT: s_andn2_b32 s4, s8, s6
; GFX7-NEXT: s_and_b32 s5, s7, s6
; GFX7-NEXT: s_or_b32 s4, s4, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_def_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_andn2_b32 s4, s4, s2
; GFX8-NEXT: s_and_b32 s2, s3, s2
; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: s_and_b32 s1, s7, s6
; GFX8-NEXT: s_andn2_b32 s0, s0, s6
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_def_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_andn2_b32 s4, s4, s2
; GFX10-NEXT: s_and_b32 s2, s3, s2
; GFX10-NEXT: s_or_b32 s2, s4, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_and_b32 s1, s7, s6
; GFX10-NEXT: s_andn2_b32 s0, s0, s6
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bfi_def_i32:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_andn2_b32 s4, s4, s2
; GFX8-GISEL-NEXT: s_and_b32 s2, s3, s2
; GFX8-GISEL-NEXT: s_or_b32 s2, s4, s2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8-GISEL-NEXT: s_and_b32 s1, s7, s6
; GFX8-GISEL-NEXT: s_andn2_b32 s0, s0, s6
; GFX8-GISEL-NEXT: s_or_b32 s0, s0, s1
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bfi_def_i32:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x2
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_andn2_b32 s4, s4, s2
; GFX10-GISEL-NEXT: s_and_b32 s2, s3, s2
; GFX10-GISEL-NEXT: s_or_b32 s2, s4, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s6
; GFX10-GISEL-NEXT: s_andn2_b32 s0, s0, s6
; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
entry:
%0 = xor i32 %x, -1
Expand Down Expand Up @@ -137,77 +134,74 @@ entry:
define amdgpu_kernel void @s_bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xd
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_xor_b32 s5, s5, s6
; GFX7-NEXT: s_and_b32 s4, s4, s5
; GFX7-NEXT: s_xor_b32 s4, s6, s4
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: s_xor_b32 s4, s7, s8
; GFX7-NEXT: s_and_b32 s4, s6, s4
; GFX7-NEXT: s_xor_b32 s4, s8, s4
; GFX7-NEXT: s_mov_b32 s1, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s3, s3, s4
; GFX8-NEXT: s_and_b32 s2, s2, s3
; GFX8-NEXT: s_xor_b32 s2, s4, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_xor_b32 s1, s7, s0
; GFX8-NEXT: s_and_b32 s1, s6, s1
; GFX8-NEXT: s_xor_b32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_xor_b32 s3, s3, s4
; GFX10-NEXT: s_and_b32 s2, s2, s3
; GFX10-NEXT: s_xor_b32 s2, s4, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_xor_b32 s1, s7, s0
; GFX10-NEXT: s_and_b32 s1, s6, s1
; GFX10-NEXT: s_xor_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_xor_b32 s3, s3, s4
; GFX8-GISEL-NEXT: s_and_b32 s2, s2, s3
; GFX8-GISEL-NEXT: s_xor_b32 s2, s4, s2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8-GISEL-NEXT: s_xor_b32 s1, s7, s0
; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s1
; GFX8-GISEL-NEXT: s_xor_b32 s0, s0, s1
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x2
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_xor_b32 s3, s3, s4
; GFX10-GISEL-NEXT: s_and_b32 s2, s2, s3
; GFX10-GISEL-NEXT: s_xor_b32 s2, s4, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_xor_b32 s1, s7, s0
; GFX10-GISEL-NEXT: s_and_b32 s1, s6, s1
; GFX10-GISEL-NEXT: s_xor_b32 s0, s0, s1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
entry:
%0 = xor i32 %y, %z
Expand Down Expand Up @@ -478,82 +472,79 @@ entry:
define amdgpu_kernel void @s_bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_sha256_ma:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xd
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s7, s4, s6
; GFX7-NEXT: s_or_b32 s4, s4, s6
; GFX7-NEXT: s_and_b32 s4, s5, s4
; GFX7-NEXT: s_or_b32 s4, s7, s4
; GFX7-NEXT: s_mov_b32 s1, s5
; GFX7-NEXT: s_or_b32 s5, s6, s8
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: s_and_b32 s4, s6, s8
; GFX7-NEXT: s_and_b32 s5, s7, s5
; GFX7-NEXT: s_or_b32 s4, s4, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_sha256_ma:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s5, s2, s4
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s2, s3, s2
; GFX8-NEXT: s_or_b32 s2, s5, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_and_b32 s1, s6, s0
; GFX8-NEXT: s_or_b32 s0, s6, s0
; GFX8-NEXT: s_and_b32 s0, s7, s0
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_sha256_ma:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_or_b32 s5, s2, s4
; GFX10-NEXT: s_and_b32 s2, s2, s4
; GFX10-NEXT: s_and_b32 s3, s3, s5
; GFX10-NEXT: s_or_b32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_or_b32 s1, s6, s0
; GFX10-NEXT: s_and_b32 s0, s6, s0
; GFX10-NEXT: s_and_b32 s1, s7, s1
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bfi_sha256_ma:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX8-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_and_b32 s5, s2, s4
; GFX8-GISEL-NEXT: s_or_b32 s2, s2, s4
; GFX8-GISEL-NEXT: s_and_b32 s2, s3, s2
; GFX8-GISEL-NEXT: s_or_b32 s2, s5, s2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s0
; GFX8-GISEL-NEXT: s_or_b32 s0, s6, s0
; GFX8-GISEL-NEXT: s_and_b32 s0, s7, s0
; GFX8-GISEL-NEXT: s_or_b32 s0, s1, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bfi_sha256_ma:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x2
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_or_b32 s5, s2, s4
; GFX10-GISEL-NEXT: s_and_b32 s2, s2, s4
; GFX10-GISEL-NEXT: s_and_b32 s3, s3, s5
; GFX10-GISEL-NEXT: s_or_b32 s2, s2, s3
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_or_b32 s1, s6, s0
; GFX10-GISEL-NEXT: s_and_b32 s0, s6, s0
; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s1
; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
entry:
%0 = and i32 %x, %z
Expand Down
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/AMDGPU/bfm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@
define amdgpu_kernel void @s_bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
; SI-LABEL: s_bfm_pattern:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfm_b32 s4, s4, s5
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_bfm_b32 s2, s2, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_bfm_pattern:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfm_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
Expand Down
Loading