124 changes: 62 additions & 62 deletions llvm/test/CodeGen/AMDGPU/ctpop64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone
define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
; SI-LABEL: s_ctpop_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_ctpop_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32],
define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp
;
; VI-LABEL: v_ctpop_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand All @@ -92,8 +92,8 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp
define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind {
; SI-LABEL: v_ctpop_i64_user:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
Expand All @@ -115,8 +115,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
;
; VI-LABEL: v_ctpop_i64_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
Expand Down Expand Up @@ -144,8 +144,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind {
; SI-LABEL: s_ctpop_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64
;
; VI-LABEL: s_ctpop_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -178,38 +178,38 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64
define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind {
; SI-LABEL: s_ctpop_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11
; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s15, 0xf000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
; SI-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; SI-NEXT: s_bcnt1_i32_b64 s7, s[10:11]
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; SI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
; SI-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
; SI-NEXT: s_bcnt1_i32_b64 s3, s[10:11]
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ctpop_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
; VI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24
; VI-NEXT: s_mov_b32 s15, 0xf000
; VI-NEXT: s_mov_b32 s14, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
Expand All @@ -220,7 +220,7 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64
define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -242,7 +242,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr
;
; VI-LABEL: v_ctpop_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -270,7 +270,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr
define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand Down Expand Up @@ -298,7 +298,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
;
; VI-LABEL: v_ctpop_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -334,11 +334,11 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) {
; SI-LABEL: ctpop_i64_in_br:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s8, s[0:1], 0xf
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; SI-NEXT: s_load_dword s0, s[2:3], 0xf
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s8, 0
; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cbranch_scc0 .LBB7_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2
Expand All @@ -363,11 +363,11 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: ctpop_i64_in_br:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s8, s[0:1], 0x3c
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_load_dword s0, s[2:3], 0x3c
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s8, 0
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc0 .LBB7_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8
Expand Down Expand Up @@ -409,8 +409,8 @@ endif:
define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind {
; SI-LABEL: s_ctpop_i128:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val
;
; VI-LABEL: s_ctpop_i128:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -443,8 +443,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val
define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind {
; SI-LABEL: s_ctpop_i65:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dword s8, s[0:1], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dword s8, s[2:3], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -460,8 +460,8 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
;
; VI-LABEL: s_ctpop_i65:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s8, s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dword s8, s[2:3], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -484,7 +484,7 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i128:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs
;
; VI-LABEL: v_ctpop_i128:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down
168 changes: 84 additions & 84 deletions llvm/test/CodeGen/AMDGPU/cttz.ll

Large diffs are not rendered by default.

166 changes: 83 additions & 83 deletions llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll

Large diffs are not rendered by default.

251 changes: 141 additions & 110 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

Large diffs are not rendered by default.

152 changes: 76 additions & 76 deletions llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll

Large diffs are not rendered by default.

243 changes: 117 additions & 126 deletions llvm/test/CodeGen/AMDGPU/ds_read2.ll

Large diffs are not rendered by default.

213 changes: 112 additions & 101 deletions llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll

Large diffs are not rendered by default.

167 changes: 88 additions & 79 deletions llvm/test/CodeGen/AMDGPU/fabs.f16.ll

Large diffs are not rendered by default.

112 changes: 56 additions & 56 deletions llvm/test/CodeGen/AMDGPU/fabs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -39,25 +39,25 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fabsf_free:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_bitset0_b32 s4, 31
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_free:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[2:3], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_bitset0_b32 s0, 31
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
Expand All @@ -69,25 +69,25 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: s_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_bitset0_b32 s4, 31
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[2:3], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_bitset0_b32 s0, 31
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
Expand All @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-LABEL: fabs_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -113,7 +113,7 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; VI-LABEL: fabs_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s3, 31
; VI-NEXT: s_bitset0_b32 s2, 31
Expand All @@ -131,26 +131,26 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-LABEL: fabsf_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s3, 31
; SI-NEXT: s_bitset0_b32 s2, 31
; SI-NEXT: s_bitset0_b32 s1, 31
; SI-NEXT: s_bitset0_b32 s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_bitset0_b32 s7, 31
; SI-NEXT: s_bitset0_b32 s6, 31
; SI-NEXT: s_bitset0_b32 s5, 31
; SI-NEXT: s_bitset0_b32 s4, 31
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fabsf_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_bitset0_b32 s3, 31
Expand Down Expand Up @@ -202,7 +202,7 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa
define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) {
; SI-LABEL: fabs_fold:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -215,7 +215,7 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i
;
; VI-LABEL: fabs_fold:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0
Expand All @@ -232,23 +232,23 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i
define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: bitpreserve_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: v_add_f32_e64 v0, |s4|, 1.0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bitpreserve_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[2:3], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_add_f32_e64 v2, |s2|, 1.0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%in.bc = bitcast float %in to i32
Expand Down
512 changes: 274 additions & 238 deletions llvm/test/CodeGen/AMDGPU/fcanonicalize.ll

Large diffs are not rendered by default.

641 changes: 326 additions & 315 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll

Large diffs are not rendered by default.

337 changes: 169 additions & 168 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll

Large diffs are not rendered by default.

290 changes: 148 additions & 142 deletions llvm/test/CodeGen/AMDGPU/fdiv.ll

Large diffs are not rendered by default.

3,306 changes: 1,653 additions & 1,653 deletions llvm/test/CodeGen/AMDGPU/flat_atomics.ll

Large diffs are not rendered by default.

258 changes: 129 additions & 129 deletions llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll

Large diffs are not rendered by default.

788 changes: 411 additions & 377 deletions llvm/test/CodeGen/AMDGPU/fma-combine.ll

Large diffs are not rendered by default.

324 changes: 162 additions & 162 deletions llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll

Large diffs are not rendered by default.

356 changes: 220 additions & 136 deletions llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll

Large diffs are not rendered by default.

123 changes: 62 additions & 61 deletions llvm/test/CodeGen/AMDGPU/fnearbyint.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
; SI-LABEL: fnearbyint_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -28,23 +28,24 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
;
; CI-LABEL: fnearbyint_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[0:1], 0xb
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_load_dword s0, s[2:3], 0xb
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: v_rndne_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: fnearbyint_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f16_e32 v2, s2
; VI-NEXT: v_rndne_f16_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
Expand All @@ -53,11 +54,11 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
; GFX11-LABEL: fnearbyint_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f16_e32 v1, s2
; GFX11-NEXT: v_rndne_f16_e32 v1, s4
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -70,8 +71,8 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
; SICI-LABEL: fnearbyint_f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dword s4, s[0:1], 0xb
; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SICI-NEXT: s_load_dword s4, s[2:3], 0xb
; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SICI-NEXT: s_mov_b32 s3, 0xf000
; SICI-NEXT: s_mov_b32 s2, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -81,10 +82,10 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
;
; VI-LABEL: fnearbyint_f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v2, s2
; VI-NEXT: v_rndne_f32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
Expand All @@ -93,11 +94,11 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
; GFX11-LABEL: fnearbyint_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v1, s2
; GFX11-NEXT: v_rndne_f32_e32 v1, s4
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -111,7 +112,7 @@ entry:
define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 {
; SICI-LABEL: fnearbyint_v2f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SICI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SICI-NEXT: s_mov_b32 s7, 0xf000
; SICI-NEXT: s_mov_b32 s6, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -124,7 +125,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; VI-LABEL: fnearbyint_v2f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_rndne_f32_e32 v1, s3
Expand All @@ -135,7 +136,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; GFX11-LABEL: fnearbyint_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v1, s3
Expand All @@ -153,8 +154,8 @@ entry:
define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 {
; SICI-LABEL: fnearbyint_v4f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SICI-NEXT: s_mov_b32 s3, 0xf000
; SICI-NEXT: s_mov_b32 s2, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -167,8 +168,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
;
; VI-LABEL: fnearbyint_v4f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v5, s1
Expand All @@ -182,8 +183,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; GFX11-LABEL: fnearbyint_v4f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v3, s7
Expand All @@ -203,7 +204,7 @@ entry:
define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
; SI-LABEL: nearbyint_f64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_brev_b32 s8, -2
Expand All @@ -227,7 +228,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; CI-LABEL: nearbyint_f64:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
; CI-NEXT: s_mov_b32 s3, 0xf000
Expand All @@ -237,7 +238,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; VI-LABEL: nearbyint_f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s0
Expand All @@ -247,7 +248,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; GFX11-LABEL: nearbyint_f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
Expand All @@ -263,41 +264,41 @@ entry:
define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %in) {
; SI-LABEL: nearbyint_v2f64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_brev_b32 s10, -2
; SI-NEXT: v_mov_b32_e32 v6, 0x43300000
; SI-NEXT: s_mov_b32 s9, 0x432fffff
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, s3
; SI-NEXT: v_mov_b32_e32 v7, s7
; SI-NEXT: v_bfi_b32 v1, s10, v6, v7
; SI-NEXT: v_mov_b32_e32 v8, s2
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: v_mov_b32_e32 v10, s0
; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1]
; SI-NEXT: v_mov_b32_e32 v8, s6
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s4
; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1]
; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1]
; SI-NEXT: v_bfi_b32 v1, s10, v6, v9
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[4:5]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1]
; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1]
; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: nearbyint_v2f64:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -308,8 +309,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
;
; VI-LABEL: nearbyint_v2f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
Expand All @@ -321,8 +322,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; GFX11-LABEL: nearbyint_v2f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
Expand All @@ -340,8 +341,8 @@ entry:
define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %in) {
; SI-LABEL: nearbyint_v4f64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x11
; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x11
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_brev_b32 s14, -2
Expand Down Expand Up @@ -390,8 +391,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
;
; CI-LABEL: nearbyint_v4f64:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -405,8 +406,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
;
; VI-LABEL: nearbyint_v4f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9]
Expand All @@ -425,8 +426,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; GFX11-LABEL: nearbyint_v4f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
Expand Down
72 changes: 36 additions & 36 deletions llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2799,7 +2799,7 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha
define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
; SI-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -2813,7 +2813,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %
;
; VI-LABEL: s_fneg_select_infloop_regression_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
Expand Down Expand Up @@ -3016,41 +3016,41 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float
define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd
; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s4, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec
; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
; SI-NEXT: s_cselect_b32 s2, 0, s2
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: s_cselect_b32 s0, 0, s0
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fneg_select_infloop_regression_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s4, 0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec
; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
; VI-NEXT: s_cselect_b32 s2, 0, s2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_cselect_b32 s0, 0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%i = select i1 %arg1, double 0.0, double %arg
Expand Down Expand Up @@ -3080,11 +3080,11 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_load_dword s4, s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
; SI-NEXT: s_bitcmp1_b32 s2, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
; SI-NEXT: s_bitcmp1_b32 s4, 16
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3]
Expand All @@ -3096,11 +3096,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a
;
; VI-LABEL: s_fneg_select_infloop_regression_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_load_dword s4, s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_bitcmp1_b32 s4, 16
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
Expand Down Expand Up @@ -3146,7 +3146,7 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) {
define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s1, 1, s1
; SI-NEXT: s_cselect_b32 s0, 0, s0
Expand All @@ -3161,7 +3161,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar
;
; VI-LABEL: s_fneg_select_infloop_regression_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, 1, s1
; VI-NEXT: s_cselect_b32 s0, 0, s0
Expand Down Expand Up @@ -3216,8 +3216,8 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s6, 0
Expand All @@ -3235,8 +3235,8 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
;
; VI-LABEL: s_fneg_select_infloop_regression_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s6, 0
Expand Down Expand Up @@ -3279,7 +3279,7 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1
define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fabs_select_infloop_regression_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
; SI-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -3293,7 +3293,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %
;
; VI-LABEL: s_fabs_select_infloop_regression_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
Expand Down Expand Up @@ -3329,7 +3329,7 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) {
define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_fabs_select_infloop_regression:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
; SI-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1
;
; VI-LABEL: s_fneg_fabs_select_infloop_regression:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
Expand Down
178 changes: 89 additions & 89 deletions llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll

Large diffs are not rendered by default.

48 changes: 24 additions & 24 deletions llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) {
; SI-LABEL: fneg_fabsf_fadd_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -18,7 +18,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x,
;
; VI-LABEL: fneg_fabsf_fadd_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_sub_f32_e64 v2, s3, |v0|
Expand All @@ -36,7 +36,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x,
define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) {
; SI-LABEL: fneg_fabsf_fmul_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x,
;
; VI-LABEL: fneg_fabsf_fmul_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0|
Expand All @@ -67,22 +67,22 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x,
define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: fneg_fabsf_free_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_or_b32 s4, s2, 0x80000000
; SI-NEXT: s_bitset1_b32 s4, 31
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fneg_fabsf_free_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 31
; VI-NEXT: s_or_b32 s2, s4, 0x80000000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
Expand Down Expand Up @@ -129,22 +129,22 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in
define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: fneg_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_or_b32 s4, s2, 0x80000000
; SI-NEXT: s_bitset1_b32 s4, 31
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fneg_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 31
; VI-NEXT: s_or_b32 s2, s4, 0x80000000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -159,7 +159,7 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_fneg_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
Expand All @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: v_fneg_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand All @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-LABEL: fneg_fabsf_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset1_b32 s3, 31
Expand All @@ -213,7 +213,7 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; VI-LABEL: fneg_fabsf_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s3, 31
; VI-NEXT: s_bitset1_b32 s2, 31
Expand All @@ -232,8 +232,8 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-LABEL: fneg_fabsf_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset1_b32 s7, 31
Expand All @@ -250,8 +250,8 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %
;
; VI-LABEL: fneg_fabsf_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s2, s7, 0x80000000
; VI-NEXT: s_or_b32 s3, s6, 0x80000000
Expand Down
218 changes: 110 additions & 108 deletions llvm/test/CodeGen/AMDGPU/fneg.ll

Large diffs are not rendered by default.

50 changes: 33 additions & 17 deletions llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,24 @@ declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %d
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret void
Expand All @@ -36,22 +42,24 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret void
Expand All @@ -65,8 +73,11 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn:
Expand All @@ -76,8 +87,11 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret <2 x half> %ret
Expand All @@ -91,10 +105,11 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn:
Expand All @@ -104,10 +119,11 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
Expand All @@ -116,7 +132,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -125,7 +141,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
Expand Down Expand Up @@ -164,7 +180,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -173,7 +189,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
Expand Down Expand Up @@ -212,7 +228,7 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
Expand All @@ -222,7 +238,7 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr
;
; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1]
Expand Down
82 changes: 45 additions & 37 deletions llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,17 @@ declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %
define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -37,7 +37,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
Expand All @@ -49,7 +49,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_mov_b32 s0, 0
Expand All @@ -58,7 +58,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
Expand All @@ -76,7 +77,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
Expand All @@ -88,7 +89,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_mov_b32 s0, 0
Expand All @@ -97,7 +98,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
Expand Down Expand Up @@ -160,8 +162,9 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
Expand All @@ -180,17 +183,17 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
; GFX940-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
Expand Down Expand Up @@ -225,17 +228,17 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
Expand Down Expand Up @@ -270,17 +273,17 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
; GFX940-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3]
; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
Expand Down Expand Up @@ -316,19 +319,23 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NEXT: ds_pk_add_f16 v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: local_atomic_fadd_v2f16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_pk_add_f16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret void
Expand All @@ -349,8 +356,11 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret <2 x half> %ret
Expand All @@ -359,24 +369,23 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NEXT: ds_pk_add_bf16 v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX12-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret void
Expand All @@ -386,10 +395,8 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn:
Expand All @@ -399,10 +406,11 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
Expand Down
322 changes: 161 additions & 161 deletions llvm/test/CodeGen/AMDGPU/fp-classify.ll

Large diffs are not rendered by default.

245 changes: 119 additions & 126 deletions llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll

Large diffs are not rendered by default.

231 changes: 112 additions & 119 deletions llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; GFX6-LABEL: test_convert_fp16_to_fp32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s10, s6
Expand All @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
;
; GFX8-LABEL: test_convert_fp16_to_fp32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
Expand All @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
;
; GFX11-LABEL: test_convert_fp16_to_fp32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; GFX6-LABEL: test_convert_fp16_to_fp64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s10, s6
Expand All @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
;
; GFX8-LABEL: test_convert_fp16_to_fp64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
Expand All @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
;
; GFX11-LABEL: test_convert_fp16_to_fp64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; GFX6-LABEL: test_convert_fp32_to_fp16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s10, s6
Expand All @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
;
; GFX8-LABEL: test_convert_fp32_to_fp16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
Expand All @@ -45,7 +45,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
;
; GFX11-LABEL: test_convert_fp32_to_fp16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
Expand Down
128 changes: 64 additions & 64 deletions llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i
define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
; SI-LABEL: raw_buffer_atomic_min_noret_f64:
; SI: ; %bb.0: ; %main_body
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_load_dword s6, s[0:1], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT: s_load_dword s6, s[2:3], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
;
; GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX7: ; %bb.0: ; %main_body
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -41,22 +41,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s8
; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
;
; G_SI-LABEL: raw_buffer_atomic_min_noret_f64:
; G_SI: ; %bb.0: ; %main_body
; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf
; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf
; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; G_SI-NEXT: s_waitcnt lgkmcnt(0)
; G_SI-NEXT: v_mov_b32_e32 v0, s4
; G_SI-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
;
; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
; G_GFX7: ; %bb.0: ; %main_body
; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf
; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf
; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX7-NEXT: v_mov_b32_e32 v0, s4
; G_GFX7-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -91,22 +91,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_clause 0x2
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c
; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
; G_GFX10-NEXT: v_mov_b32_e32 v2, s8
; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c
; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand Down Expand Up @@ -253,9 +253,9 @@ main_body:
define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
; SI-LABEL: raw_buffer_atomic_max_noret_f64:
; SI: ; %bb.0: ; %main_body
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_load_dword s6, s[0:1], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT: s_load_dword s6, s[2:3], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
;
; GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX7: ; %bb.0: ; %main_body
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -278,22 +278,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s8
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
;
; G_SI-LABEL: raw_buffer_atomic_max_noret_f64:
; G_SI: ; %bb.0: ; %main_body
; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf
; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf
; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; G_SI-NEXT: s_waitcnt lgkmcnt(0)
; G_SI-NEXT: v_mov_b32_e32 v0, s4
; G_SI-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
;
; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
; G_GFX7: ; %bb.0: ; %main_body
; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf
; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf
; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX7-NEXT: v_mov_b32_e32 v0, s4
; G_GFX7-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -328,22 +328,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_clause 0x2
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c
; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
; G_GFX10-NEXT: v_mov_b32_e32 v2, s8
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c
; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand Down Expand Up @@ -424,7 +424,7 @@ main_body:
define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) {
; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; SI: ; %bb.0: ; %main_body
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
Expand All @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX7: ; %bb.0: ; %main_body
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
Expand All @@ -452,7 +452,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
Expand All @@ -465,7 +465,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_SI: ; %bb.0: ; %main_body
; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; G_SI-NEXT: s_mov_b32 m0, -1
; G_SI-NEXT: s_waitcnt lgkmcnt(0)
; G_SI-NEXT: v_mov_b32_e32 v0, s4
Expand All @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX7: ; %bb.0: ; %main_body
; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; G_GFX7-NEXT: s_mov_b32 m0, -1
; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX7-NEXT: v_mov_b32_e32 v0, s4
Expand All @@ -506,7 +506,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
Expand All @@ -519,7 +519,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand Down
Loading