242 changes: 121 additions & 121 deletions llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll

Large diffs are not rendered by default.

394 changes: 188 additions & 206 deletions llvm/test/CodeGen/AMDGPU/ctlz.ll

Large diffs are not rendered by default.

505 changes: 264 additions & 241 deletions llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

Large diffs are not rendered by default.

88 changes: 44 additions & 44 deletions llvm/test/CodeGen/AMDGPU/ctpop16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind {
; SI-LABEL: s_ctpop_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -27,8 +27,8 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val)
;
; VI-LABEL: s_ctpop_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -72,7 +72,7 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val)
define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -91,7 +91,7 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp
;
; VI-LABEL: v_ctpop_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -142,8 +142,8 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp
define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind {
; SI-LABEL: v_ctpop_add_chain_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s11
Expand All @@ -166,8 +166,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out,
;
; VI-LABEL: v_ctpop_add_chain_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
Expand Down Expand Up @@ -239,8 +239,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out,
define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind {
; SI-LABEL: v_ctpop_add_sgpr_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dword s12, s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dword s12, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
Expand All @@ -259,8 +259,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p
;
; VI-LABEL: v_ctpop_add_sgpr_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dword s0, s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
Expand Down Expand Up @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, p
define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -344,7 +344,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr
;
; VI-LABEL: v_ctpop_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -400,7 +400,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr
define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand Down Expand Up @@ -430,7 +430,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
;
; VI-LABEL: v_ctpop_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -520,7 +520,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr
define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v8i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
Expand Down Expand Up @@ -562,7 +562,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
;
; VI-LABEL: v_ctpop_v8i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -700,7 +700,7 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr
define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v16i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
Expand Down Expand Up @@ -769,7 +769,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add
;
; VI-LABEL: v_ctpop_v16i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -1016,7 +1016,7 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add
define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i16_add_inline_constant:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal
;
; VI-LABEL: v_ctpop_i16_add_inline_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -1088,7 +1088,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal
define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i16_add_inline_constant_inv:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -1107,7 +1107,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1)
;
; VI-LABEL: v_ctpop_i16_add_inline_constant_inv:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1)
define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i16_add_literal:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out
;
; VI-LABEL: v_ctpop_i16_add_literal:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_movk_i32 s4, 0x3e7
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand Down Expand Up @@ -1234,8 +1234,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out
define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind {
; SI-LABEL: v_ctpop_i16_add_var:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dword s12, s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dword s12, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
Expand All @@ -1254,8 +1254,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt
;
; VI-LABEL: v_ctpop_i16_add_var:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dword s0, s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
Expand Down Expand Up @@ -1315,8 +1315,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, pt
define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind {
; SI-LABEL: v_ctpop_i16_add_var_inv:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dword s12, s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dword s12, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
Expand All @@ -1335,8 +1335,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out
;
; VI-LABEL: v_ctpop_i16_add_var_inv:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dword s0, s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
Expand Down Expand Up @@ -1396,8 +1396,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out
define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind {
; SI-LABEL: v_ctpop_i16_add_vvar_inv:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s11
Expand All @@ -1418,8 +1418,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou
;
; VI-LABEL: v_ctpop_i16_add_vvar_inv:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
Expand Down Expand Up @@ -1487,8 +1487,8 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou
define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) {
; SI-LABEL: ctpop_i16_in_br:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s5, s4, 16
; SI-NEXT: s_cmp_lg_u32 s5, 0
Expand Down Expand Up @@ -1517,8 +1517,8 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: ctpop_i16_in_br:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s4, 16
; VI-NEXT: s_cmp_lg_u32 s5, 0
Expand Down
124 changes: 62 additions & 62 deletions llvm/test/CodeGen/AMDGPU/ctpop64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone
define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
; SI-LABEL: s_ctpop_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -28,8 +28,8 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_ctpop_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -46,7 +46,7 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32],
define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp
;
; VI-LABEL: v_ctpop_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand All @@ -92,8 +92,8 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp
define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind {
; SI-LABEL: v_ctpop_i64_user:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
Expand All @@ -115,8 +115,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
;
; VI-LABEL: v_ctpop_i64_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
Expand Down Expand Up @@ -144,8 +144,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind {
; SI-LABEL: s_ctpop_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -158,8 +158,8 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64
;
; VI-LABEL: s_ctpop_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -178,38 +178,38 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64
define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind {
; SI-LABEL: s_ctpop_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11
; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s15, 0xf000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; SI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
; SI-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
; SI-NEXT: s_bcnt1_i32_b64 s3, s[10:11]
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; SI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
; SI-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; SI-NEXT: s_bcnt1_i32_b64 s7, s[10:11]
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ctpop_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
; VI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x24
; VI-NEXT: s_mov_b32 s15, 0xf000
; VI-NEXT: s_mov_b32 s14, -1
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
Expand All @@ -220,7 +220,7 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64
define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -242,7 +242,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr
;
; VI-LABEL: v_ctpop_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -270,7 +270,7 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr
define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand Down Expand Up @@ -298,7 +298,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
;
; VI-LABEL: v_ctpop_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down Expand Up @@ -334,11 +334,11 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr
define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) {
; SI-LABEL: ctpop_i64_in_br:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s0, s[2:3], 0xf
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd
; SI-NEXT: s_load_dword s8, s[0:1], 0xf
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cmp_lg_u32 s8, 0
; SI-NEXT: s_cbranch_scc0 .LBB7_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2
Expand All @@ -363,11 +363,11 @@ define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: ctpop_i64_in_br:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[2:3], 0x3c
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
; VI-NEXT: s_load_dword s8, s[0:1], 0x3c
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cmp_lg_u32 s8, 0
; VI-NEXT: s_cbranch_scc0 .LBB7_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8
Expand Down Expand Up @@ -409,8 +409,8 @@ endif:
define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind {
; SI-LABEL: s_ctpop_i128:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -423,8 +423,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val
;
; VI-LABEL: s_ctpop_i128:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -443,8 +443,8 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val
define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind {
; SI-LABEL: s_ctpop_i65:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dword s8, s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dword s8, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -460,8 +460,8 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
;
; VI-LABEL: s_ctpop_i65:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dword s8, s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s8, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -484,7 +484,7 @@ define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val)
define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i128:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
Expand All @@ -507,7 +507,7 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs
;
; VI-LABEL: v_ctpop_i128:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand Down
296 changes: 148 additions & 148 deletions llvm/test/CodeGen/AMDGPU/cttz.ll

Large diffs are not rendered by default.

266 changes: 133 additions & 133 deletions llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll

Large diffs are not rendered by default.

511 changes: 239 additions & 272 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll

Large diffs are not rendered by default.

176 changes: 88 additions & 88 deletions llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll

Large diffs are not rendered by default.

243 changes: 126 additions & 117 deletions llvm/test/CodeGen/AMDGPU/ds_read2.ll

Large diffs are not rendered by default.

213 changes: 101 additions & 112 deletions llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll

Large diffs are not rendered by default.

167 changes: 79 additions & 88 deletions llvm/test/CodeGen/AMDGPU/fabs.f16.ll

Large diffs are not rendered by default.

112 changes: 56 additions & 56 deletions llvm/test/CodeGen/AMDGPU/fabs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -39,25 +39,25 @@ define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fabsf_free:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s4, 31
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_bitset0_b32 s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_free:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_bitset0_b32 s0, 31
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
Expand All @@ -69,25 +69,25 @@ define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: s_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s4, 31
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_bitset0_b32 s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_bitset0_b32 s0, 31
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
Expand All @@ -98,7 +98,7 @@ define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-LABEL: fabs_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -113,7 +113,7 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; VI-LABEL: fabs_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s3, 31
; VI-NEXT: s_bitset0_b32 s2, 31
Expand All @@ -131,26 +131,26 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-LABEL: fabsf_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset0_b32 s7, 31
; SI-NEXT: s_bitset0_b32 s6, 31
; SI-NEXT: s_bitset0_b32 s5, 31
; SI-NEXT: s_bitset0_b32 s4, 31
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_bitset0_b32 s3, 31
; SI-NEXT: s_bitset0_b32 s2, 31
; SI-NEXT: s_bitset0_b32 s1, 31
; SI-NEXT: s_bitset0_b32 s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fabsf_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x34
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_bitset0_b32 s3, 31
Expand Down Expand Up @@ -202,7 +202,7 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa
define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) {
; SI-LABEL: fabs_fold:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -215,7 +215,7 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i
;
; VI-LABEL: fabs_fold:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0
Expand All @@ -232,23 +232,23 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i
define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: bitpreserve_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_add_f32_e64 v0, |s4|, 1.0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: bitpreserve_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_add_f32_e64 v2, |s2|, 1.0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%in.bc = bitcast float %in to i32
Expand Down
512 changes: 238 additions & 274 deletions llvm/test/CodeGen/AMDGPU/fcanonicalize.ll

Large diffs are not rendered by default.

659 changes: 324 additions & 335 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll

Large diffs are not rendered by default.

337 changes: 168 additions & 169 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll

Large diffs are not rendered by default.

358 changes: 176 additions & 182 deletions llvm/test/CodeGen/AMDGPU/fdiv.ll

Large diffs are not rendered by default.

3,322 changes: 1,661 additions & 1,661 deletions llvm/test/CodeGen/AMDGPU/flat_atomics.ll

Large diffs are not rendered by default.

306 changes: 153 additions & 153 deletions llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll

Large diffs are not rendered by default.

788 changes: 377 additions & 411 deletions llvm/test/CodeGen/AMDGPU/fma-combine.ll

Large diffs are not rendered by default.

324 changes: 162 additions & 162 deletions llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll

Large diffs are not rendered by default.

500 changes: 208 additions & 292 deletions llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll

Large diffs are not rendered by default.

123 changes: 61 additions & 62 deletions llvm/test/CodeGen/AMDGPU/fnearbyint.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
; SI-LABEL: fnearbyint_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -28,24 +28,23 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
;
; CI-LABEL: fnearbyint_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[2:3], 0xb
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; CI-NEXT: s_load_dword s2, s[0:1], 0xb
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: v_rndne_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: fnearbyint_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f16_e32 v2, s4
; VI-NEXT: v_rndne_f16_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
Expand All @@ -54,11 +53,11 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
; GFX11-LABEL: fnearbyint_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f16_e32 v1, s4
; GFX11-NEXT: v_rndne_f16_e32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -71,8 +70,8 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
; SICI-LABEL: fnearbyint_f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dword s4, s[2:3], 0xb
; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SICI-NEXT: s_load_dword s4, s[0:1], 0xb
; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SICI-NEXT: s_mov_b32 s3, 0xf000
; SICI-NEXT: s_mov_b32 s2, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -82,10 +81,10 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
;
; VI-LABEL: fnearbyint_f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v2, s4
; VI-NEXT: v_rndne_f32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
Expand All @@ -94,11 +93,11 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
; GFX11-LABEL: fnearbyint_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v1, s4
; GFX11-NEXT: v_rndne_f32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -112,7 +111,7 @@ entry:
define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 {
; SICI-LABEL: fnearbyint_v2f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SICI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SICI-NEXT: s_mov_b32 s7, 0xf000
; SICI-NEXT: s_mov_b32 s6, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -125,7 +124,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; VI-LABEL: fnearbyint_v2f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_rndne_f32_e32 v1, s3
Expand All @@ -136,7 +135,7 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; GFX11-LABEL: fnearbyint_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v1, s3
Expand All @@ -154,8 +153,8 @@ entry:
define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 {
; SICI-LABEL: fnearbyint_v4f32:
; SICI: ; %bb.0: ; %entry
; SICI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SICI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SICI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; SICI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SICI-NEXT: s_mov_b32 s3, 0xf000
; SICI-NEXT: s_mov_b32 s2, -1
; SICI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -168,8 +167,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
;
; VI-LABEL: fnearbyint_v4f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v5, s1
Expand All @@ -183,8 +182,8 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; GFX11-LABEL: fnearbyint_v4f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v3, s7
Expand All @@ -204,7 +203,7 @@ entry:
define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
; SI-LABEL: nearbyint_f64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_brev_b32 s8, -2
Expand All @@ -228,7 +227,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; CI-LABEL: nearbyint_f64:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
; CI-NEXT: s_mov_b32 s3, 0xf000
Expand All @@ -238,7 +237,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; VI-LABEL: nearbyint_f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s0
Expand All @@ -248,7 +247,7 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; GFX11-LABEL: nearbyint_f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
Expand All @@ -264,41 +263,41 @@ entry:
define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %in) {
; SI-LABEL: nearbyint_v2f64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_brev_b32 s10, -2
; SI-NEXT: v_mov_b32_e32 v6, 0x43300000
; SI-NEXT: s_mov_b32 s9, 0x432fffff
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v7, s7
; SI-NEXT: v_mov_b32_e32 v7, s3
; SI-NEXT: v_bfi_b32 v1, s10, v6, v7
; SI-NEXT: v_mov_b32_e32 v8, s6
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s4
; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1]
; SI-NEXT: v_mov_b32_e32 v8, s2
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: v_mov_b32_e32 v10, s0
; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1]
; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1]
; SI-NEXT: v_bfi_b32 v1, s10, v6, v9
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1]
; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1]
; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5]
; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: nearbyint_v2f64:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -309,8 +308,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
;
; VI-LABEL: nearbyint_v2f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
Expand All @@ -322,8 +321,8 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; GFX11-LABEL: nearbyint_v2f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
Expand All @@ -341,8 +340,8 @@ entry:
define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %in) {
; SI-LABEL: nearbyint_v4f64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x11
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_brev_b32 s14, -2
Expand Down Expand Up @@ -391,8 +390,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
;
; CI-LABEL: nearbyint_v4f64:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11
; CI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -406,8 +405,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
;
; VI-LABEL: nearbyint_v4f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9]
Expand All @@ -426,8 +425,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; GFX11-LABEL: nearbyint_v4f64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
Expand Down
72 changes: 36 additions & 36 deletions llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2799,7 +2799,7 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha
define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
; SI-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -2813,7 +2813,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %
;
; VI-LABEL: s_fneg_select_infloop_regression_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
Expand Down Expand Up @@ -3016,41 +3016,41 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float
define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0xd
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s4, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: s_and_b64 s[6:7], s[4:5], exec
; SI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
; SI-NEXT: s_cselect_b32 s0, 0, s0
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: s_cselect_b32 s2, 0, s2
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_fneg_select_infloop_regression_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x34
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s4, 0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec
; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
; VI-NEXT: s_cselect_b32 s0, 0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_cselect_b32 s2, 0, s2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%i = select i1 %arg1, double 0.0, double %arg
Expand Down Expand Up @@ -3080,11 +3080,11 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
; SI-NEXT: s_load_dword s2, s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
; SI-NEXT: s_bitcmp1_b32 s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
; SI-NEXT: s_bitcmp1_b32 s2, 16
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, s[2:3]
Expand All @@ -3096,11 +3096,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a
;
; VI-LABEL: s_fneg_select_infloop_regression_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
; VI-NEXT: s_load_dword s2, s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s4, 16
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_bitcmp1_b32 s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
Expand Down Expand Up @@ -3146,7 +3146,7 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) {
define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s1, 1, s1
; SI-NEXT: s_cselect_b32 s0, 0, s0
Expand All @@ -3161,7 +3161,7 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar
;
; VI-LABEL: s_fneg_select_infloop_regression_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, 1, s1
; VI-NEXT: s_cselect_b32 s0, 0, s0
Expand Down Expand Up @@ -3216,8 +3216,8 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_select_infloop_regression_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: v_bfrev_b32_e32 v0, 1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s6, 0
Expand All @@ -3235,8 +3235,8 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
;
; VI-LABEL: s_fneg_select_infloop_regression_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s6, 0
Expand Down Expand Up @@ -3279,7 +3279,7 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1
define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fabs_select_infloop_regression_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
; SI-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -3293,7 +3293,7 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %
;
; VI-LABEL: s_fabs_select_infloop_regression_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
Expand Down Expand Up @@ -3329,7 +3329,7 @@ define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) {
define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1, ptr addrspace(1) %ptr) {
; SI-LABEL: s_fneg_fabs_select_infloop_regression:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s1, 0
; SI-NEXT: v_mov_b32_e32 v0, s0
Expand All @@ -3343,7 +3343,7 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1
;
; VI-LABEL: s_fneg_fabs_select_infloop_regression:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
Expand Down
178 changes: 89 additions & 89 deletions llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll

Large diffs are not rendered by default.

48 changes: 24 additions & 24 deletions llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) {
; SI-LABEL: fneg_fabsf_fadd_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -18,7 +18,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x,
;
; VI-LABEL: fneg_fabsf_fadd_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_sub_f32_e64 v2, s3, |v0|
Expand All @@ -36,7 +36,7 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x,
define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) {
; SI-LABEL: fneg_fabsf_fmul_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
Expand All @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x,
;
; VI-LABEL: fneg_fabsf_fmul_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0|
Expand All @@ -67,22 +67,22 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x,
define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: fneg_fabsf_free_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset1_b32 s4, 31
; SI-NEXT: s_or_b32 s4, s2, 0x80000000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fneg_fabsf_free_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s2, s4, 0x80000000
; VI-NEXT: s_bitset1_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
Expand Down Expand Up @@ -129,22 +129,22 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in
define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: fneg_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[2:3], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset1_b32 s4, 31
; SI-NEXT: s_or_b32 s4, s2, 0x80000000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fneg_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s2, s4, 0x80000000
; VI-NEXT: s_bitset1_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -159,7 +159,7 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_fneg_fabsf_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
Expand All @@ -177,7 +177,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: v_fneg_fabsf_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
Expand All @@ -198,7 +198,7 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-LABEL: fneg_fabsf_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset1_b32 s3, 31
Expand All @@ -213,7 +213,7 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
;
; VI-LABEL: fneg_fabsf_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s3, 31
; VI-NEXT: s_bitset1_b32 s2, 31
Expand All @@ -232,8 +232,8 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %
define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; SI-LABEL: fneg_fabsf_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitset1_b32 s7, 31
Expand All @@ -250,8 +250,8 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %
;
; VI-LABEL: fneg_fabsf_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s2, s7, 0x80000000
; VI-NEXT: s_or_b32 s3, s6, 0x80000000
Expand Down
218 changes: 108 additions & 110 deletions llvm/test/CodeGen/AMDGPU/fneg.ll

Large diffs are not rendered by default.

50 changes: 17 additions & 33 deletions llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,18 @@ declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %d
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret void
Expand All @@ -42,24 +36,22 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret void
Expand All @@ -73,11 +65,8 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn:
Expand All @@ -87,11 +76,8 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
ret <2 x half> %ret
Expand All @@ -105,11 +91,10 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn:
Expand All @@ -119,11 +104,10 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-GISEL-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
Expand All @@ -132,7 +116,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -141,7 +125,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
Expand Down Expand Up @@ -180,7 +164,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
Expand All @@ -189,7 +173,7 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
Expand Down Expand Up @@ -228,7 +212,7 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
Expand All @@ -238,7 +222,7 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr
;
; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1]
Expand Down
376 changes: 331 additions & 45 deletions llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll

Large diffs are not rendered by default.

322 changes: 161 additions & 161 deletions llvm/test/CodeGen/AMDGPU/fp-classify.ll

Large diffs are not rendered by default.

263 changes: 135 additions & 128 deletions llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll

Large diffs are not rendered by default.

249 changes: 128 additions & 121 deletions llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; GFX6-LABEL: test_convert_fp16_to_fp32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s10, s6
Expand All @@ -28,7 +28,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
;
; GFX8-LABEL: test_convert_fp16_to_fp32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
Expand All @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o
;
; GFX11-LABEL: test_convert_fp16_to_fp32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; GFX6-LABEL: test_convert_fp16_to_fp64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s10, s6
Expand All @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
;
; GFX8-LABEL: test_convert_fp16_to_fp64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
Expand All @@ -46,7 +46,7 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o
;
; GFX11-LABEL: test_convert_fp16_to_fp64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; GFX6-LABEL: test_convert_fp32_to_fp16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s10, s6
Expand All @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
;
; GFX8-LABEL: test_convert_fp32_to_fp16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_mov_b32 s10, s6
Expand All @@ -45,7 +45,7 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o
;
; GFX11-LABEL: test_convert_fp32_to_fp16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s10, s6
Expand Down
148 changes: 74 additions & 74 deletions llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i
define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
; SI-LABEL: raw_buffer_atomic_min_noret_f64:
; SI: ; %bb.0: ; %main_body
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT: s_load_dword s6, s[2:3], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_load_dword s6, s[0:1], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -28,9 +28,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
;
; GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX7: ; %bb.0: ; %main_body
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -41,22 +41,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v2, s8
; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -66,9 +66,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
;
; G_SI-LABEL: raw_buffer_atomic_min_noret_f64:
; G_SI: ; %bb.0: ; %main_body
; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf
; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf
; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; G_SI-NEXT: s_waitcnt lgkmcnt(0)
; G_SI-NEXT: v_mov_b32_e32 v0, s4
; G_SI-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -78,9 +78,9 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
;
; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
; G_GFX7: ; %bb.0: ; %main_body
; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf
; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf
; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX7-NEXT: v_mov_b32_e32 v0, s4
; G_GFX7-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -91,22 +91,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_clause 0x2
; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c
; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
; G_GFX10-NEXT: v_mov_b32_e32 v2, s8
; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c
; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand Down Expand Up @@ -253,9 +253,9 @@ main_body:
define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
; SI-LABEL: raw_buffer_atomic_max_noret_f64:
; SI: ; %bb.0: ; %main_body
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; SI-NEXT: s_load_dword s6, s[2:3], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_load_dword s6, s[0:1], 0xf
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -265,9 +265,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
;
; GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX7: ; %bb.0: ; %main_body
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; GFX7-NEXT: s_load_dword s6, s[2:3], 0xf
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GFX7-NEXT: s_load_dword s6, s[0:1], 0xf
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -278,22 +278,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v2, s8
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -303,9 +303,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
;
; G_SI-LABEL: raw_buffer_atomic_max_noret_f64:
; G_SI: ; %bb.0: ; %main_body
; G_SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; G_SI-NEXT: s_load_dword s6, s[2:3], 0xf
; G_SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; G_SI-NEXT: s_load_dword s6, s[0:1], 0xf
; G_SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; G_SI-NEXT: s_waitcnt lgkmcnt(0)
; G_SI-NEXT: v_mov_b32_e32 v0, s4
; G_SI-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -315,9 +315,9 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
;
; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
; G_GFX7: ; %bb.0: ; %main_body
; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
; G_GFX7-NEXT: s_load_dword s6, s[2:3], 0xf
; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; G_GFX7-NEXT: s_load_dword s6, s[0:1], 0xf
; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX7-NEXT: v_mov_b32_e32 v0, s4
; G_GFX7-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -328,22 +328,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_clause 0x2
; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; G_GFX10-NEXT: s_load_dword s8, s[2:3], 0x3c
; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s0
; G_GFX10-NEXT: v_mov_b32_e32 v1, s1
; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
; G_GFX10-NEXT: v_mov_b32_e32 v2, s8
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; G_GFX1030-NEXT: s_load_dword s6, s[2:3], 0x3c
; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand Down Expand Up @@ -424,7 +424,7 @@ main_body:
define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, ptr addrspace(3) %out) {
; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; SI: ; %bb.0: ; %main_body
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
Expand All @@ -438,7 +438,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX7: ; %bb.0: ; %main_body
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
Expand All @@ -452,20 +452,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s4
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: v_mov_b32_e32 v2, s6
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s7
; GFX10-NEXT: v_mov_b32_e32 v0, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: v_mov_b32_e32 v2, s10
; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; GFX10-NEXT: v_mov_b32_e32 v2, s11
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_write_b64 v2, v[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -478,7 +478,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_SI: ; %bb.0: ; %main_body
; G_SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; G_SI-NEXT: s_mov_b32 m0, -1
; G_SI-NEXT: s_waitcnt lgkmcnt(0)
; G_SI-NEXT: v_mov_b32_e32 v0, s4
Expand All @@ -492,7 +492,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX7: ; %bb.0: ; %main_body
; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; G_GFX7-NEXT: s_mov_b32 m0, -1
; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX7-NEXT: v_mov_b32_e32 v0, s4
Expand All @@ -506,20 +506,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre
;
; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX10: ; %bb.0: ; %main_body
; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v0, s4
; G_GFX10-NEXT: v_mov_b32_e32 v1, s5
; G_GFX10-NEXT: v_mov_b32_e32 v2, s6
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s7
; G_GFX10-NEXT: v_mov_b32_e32 v0, s8
; G_GFX10-NEXT: v_mov_b32_e32 v1, s9
; G_GFX10-NEXT: v_mov_b32_e32 v2, s10
; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
; G_GFX10-NEXT: v_mov_b32_e32 v2, s11
; G_GFX10-NEXT: s_waitcnt vmcnt(0)
; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT: s_endpgm
;
; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
Expand Down
Loading