114 changes: 57 additions & 57 deletions llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_add_src_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32:
Expand Down Expand Up @@ -117,14 +117,14 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_multi_use_src_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e64 v2, v1, v1 clamp
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
Expand Down Expand Up @@ -194,13 +194,13 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_dbg_use_src_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_dbg_use_src_f32:
Expand Down Expand Up @@ -267,14 +267,14 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_add_neg_src_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_floor_f32_e32 v1, v1
; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_neg_src_f32:
Expand Down Expand Up @@ -342,14 +342,14 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_non_clamp_max_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_non_clamp_max_f32:
Expand Down Expand Up @@ -413,13 +413,13 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_add_src_f32_denormals:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32_denormals:
Expand Down Expand Up @@ -485,13 +485,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_add_src_f16_denorm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_denorm:
Expand Down Expand Up @@ -557,13 +557,13 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_no_denormals:
Expand Down Expand Up @@ -629,14 +629,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_add_src_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f32:
Expand Down Expand Up @@ -701,13 +701,13 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_add_src_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f64:
Expand Down Expand Up @@ -866,13 +866,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm:
Expand Down Expand Up @@ -947,13 +947,13 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals:
Expand Down Expand Up @@ -1038,14 +1038,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg:
Expand Down Expand Up @@ -1124,14 +1124,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
Expand Down Expand Up @@ -1212,14 +1212,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
Expand Down Expand Up @@ -1298,14 +1298,14 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
Expand Down Expand Up @@ -1382,14 +1382,14 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src:
Expand Down Expand Up @@ -1469,14 +1469,14 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_no_clamp_add_packed_src_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_packed_src_f32:
Expand Down Expand Up @@ -1553,15 +1553,15 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src:
Expand Down
260 changes: 130 additions & 130 deletions llvm/test/CodeGen/AMDGPU/clamp.ll

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) {
; GCN-LABEL: vectorLoadCombine:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_load_dword v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
Expand All @@ -37,14 +37,14 @@ entry:
define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) {
; GCN-LABEL: vectorLoadShuffle:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN-NEXT: s_mov_b32 s0, 0x7050604
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_load_dword v2, v[0:1]
; GCN-NEXT: s_mov_b32 s0, 0x7050604
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_perm_b32 v2, v2, v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252
; GCN-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GCN-NEXT: s_cselect_b32 s2, 2, 3
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: global_store_dword v1, v0, s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s0, s[6:7], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN-NEXT: s_cselect_b32 s0, 2, 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: global_store_dword v1, v0, s[4:5]
; GCN-NEXT: s_endpgm
entry: ; preds = %1009
%0 = load i32, ptr addrspace(1) %in, align 4
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2838,16 +2838,16 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; GFX10-LABEL: cvt_ubyte0_or_multiuse:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
; GFX10-NEXT: global_load_dword v0, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: cvt_ubyte0_or_multiuse:
Expand Down
212 changes: 106 additions & 106 deletions llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -277,25 +277,25 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX9-LABEL: uniform_vec_i16_LL:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX906-LABEL: uniform_vec_i16_LL:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -561,25 +561,25 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX9-LABEL: uniform_vec_f16_LL:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; GFX906-LABEL: uniform_vec_f16_LL:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
Expand Down
46 changes: 23 additions & 23 deletions llvm/test/CodeGen/AMDGPU/ds_write2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1)
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
Expand Down Expand Up @@ -151,12 +151,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1)
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
Expand Down Expand Up @@ -368,11 +368,11 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C
;
; GFX9-LABEL: simple_write2_two_val_too_far_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand Down Expand Up @@ -413,11 +413,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr
;
; GFX9-LABEL: simple_write2_two_val_f32_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
Expand Down Expand Up @@ -469,11 +469,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa
;
; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
Expand Down Expand Up @@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
;
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-ALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8
; GFX9-ALIGNED-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
Expand All @@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
;
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8
; GFX9-UNALIGNED-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
Expand Down
136 changes: 68 additions & 68 deletions llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -197,24 +197,24 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
;
; GFX9-LABEL: v_rcp_f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16:
Expand Down Expand Up @@ -293,24 +293,24 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX9-LABEL: v_rcp_f16_abs:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e64 v1, |v1|
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_abs:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e64 v1, |v1|
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_abs:
Expand Down Expand Up @@ -392,24 +392,24 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
;
; GFX9-LABEL: reciprocal_f16_rounded:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: reciprocal_f16_rounded:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: reciprocal_f16_rounded:
Expand Down Expand Up @@ -475,24 +475,24 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX9-LABEL: v_rcp_f16_afn:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_afn:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_afn:
Expand Down Expand Up @@ -571,24 +571,24 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX9-LABEL: v_rcp_f16_neg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_neg:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_neg:
Expand Down Expand Up @@ -670,24 +670,24 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
;
; GFX9-LABEL: v_rsq_f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v1, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v1, v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16:
Expand Down Expand Up @@ -771,26 +771,26 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX9-LABEL: v_rsq_f16_neg:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v1, v1
; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_neg:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v1, v1
; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_neg:
Expand Down Expand Up @@ -879,28 +879,28 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
;
; GFX9-LABEL: v_rsq_f16_multi_use:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v0, v2, s[0:1]
; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_multi_use:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-NEXT: global_store_short v0, v2, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_multi_use:
Expand Down Expand Up @@ -987,26 +987,26 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
;
; GFX9-LABEL: v_rsq_f16_missing_contract0:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_missing_contract0:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract0:
Expand Down Expand Up @@ -1092,26 +1092,26 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
;
; GFX9-LABEL: v_rsq_f16_missing_contract1:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_missing_contract1:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract1:
Expand Down Expand Up @@ -1197,26 +1197,26 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
;
; GFX9-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
Expand Down
120 changes: 60 additions & 60 deletions llvm/test/CodeGen/AMDGPU/flat_atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5279,15 +5279,15 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_i32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -5328,15 +5328,15 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_i32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -5689,15 +5689,15 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_f32_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -5738,15 +5738,15 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_f32:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -6099,15 +6099,15 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_i8_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -6148,15 +6148,15 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_i8:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -6394,15 +6394,15 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_i16_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -6443,15 +6443,15 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_i16:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -7979,15 +7979,15 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_f16_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr half, ptr %in, i64 8
Expand Down Expand Up @@ -8027,15 +8027,15 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_f16:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic half, ptr %in seq_cst, align 2
Expand Down Expand Up @@ -8078,15 +8078,15 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_bf16_offset:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr %in, i64 8
Expand Down Expand Up @@ -8126,15 +8126,15 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
;
; GCN3-LABEL: atomic_load_bf16:
; GCN3: ; %bb.0:
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic bfloat, ptr %in seq_cst, align 2
Expand Down
130 changes: 65 additions & 65 deletions llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4439,23 +4439,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN3-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN3-NEXT: s_add_u32 s0, s0, s6
; GCN3-NEXT: s_addc_u32 s1, s1, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s5
; GCN3-NEXT: v_mov_b32_e32 v5, s4
; GCN3-NEXT: v_mov_b32_e32 v4, s9
; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
Expand All @@ -4467,8 +4467,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_cbranch_execnz .LBB89_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -4654,23 +4654,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN3-LABEL: atomic_max_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN3-NEXT: s_add_u32 s0, s0, s6
; GCN3-NEXT: s_addc_u32 s1, s1, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s5
; GCN3-NEXT: v_mov_b32_e32 v5, s4
; GCN3-NEXT: v_mov_b32_e32 v4, s9
; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
Expand All @@ -4682,8 +4682,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_cbranch_execnz .LBB91_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -5821,23 +5821,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN3-NEXT: s_add_u32 s0, s0, s6
; GCN3-NEXT: s_addc_u32 s1, s1, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s5
; GCN3-NEXT: v_mov_b32_e32 v5, s4
; GCN3-NEXT: v_mov_b32_e32 v4, s9
; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
Expand All @@ -5849,8 +5849,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: s_cbranch_execnz .LBB103_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -5934,23 +5934,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN3-LABEL: atomic_umax_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN3-NEXT: s_add_u32 s0, s0, s6
; GCN3-NEXT: s_addc_u32 s1, s1, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s5
; GCN3-NEXT: v_mov_b32_e32 v5, s4
; GCN3-NEXT: v_mov_b32_e32 v4, s9
; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
Expand All @@ -5962,8 +5962,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN3-NEXT: s_cbranch_execnz .LBB104_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -8045,23 +8045,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN3-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN3-NEXT: s_add_u32 s0, s0, s6
; GCN3-NEXT: s_addc_u32 s1, s1, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s5
; GCN3-NEXT: v_mov_b32_e32 v5, s4
; GCN3-NEXT: v_mov_b32_e32 v4, s9
; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
Expand All @@ -8073,8 +8073,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_cbranch_execnz .LBB126_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
Expand Down Expand Up @@ -8148,30 +8148,30 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
;
; GCN3-LABEL: atomic_min_i64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN3-NEXT: s_mov_b64 s[4:5], 0
; GCN3-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: v_mov_b32_e32 v5, s1
; GCN3-NEXT: v_mov_b32_e32 v6, s3
; GCN3-NEXT: v_mov_b32_e32 v7, s2
; GCN3-NEXT: v_mov_b32_e32 v4, s0
; GCN3-NEXT: v_mov_b32_e32 v4, s4
; GCN3-NEXT: v_mov_b32_e32 v6, s7
; GCN3-NEXT: v_mov_b32_e32 v7, s6
; GCN3-NEXT: v_mov_b32_e32 v5, s5
; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN3-NEXT: v_mov_b32_e32 v3, v1
; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB127_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
Expand Down Expand Up @@ -8253,23 +8253,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN3-LABEL: atomic_min_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
; GCN3-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GCN3-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GCN3-NEXT: s_add_u32 s0, s0, s6
; GCN3-NEXT: s_addc_u32 s1, s1, s7
; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GCN3-NEXT: s_add_u32 s0, s4, s0
; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: v_mov_b32_e32 v4, s5
; GCN3-NEXT: v_mov_b32_e32 v5, s4
; GCN3-NEXT: v_mov_b32_e32 v4, s9
; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
Expand All @@ -8281,8 +8281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_cbranch_execnz .LBB128_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v0, s2
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: v_mov_b32_e32 v0, s6
; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
Expand Down
152 changes: 76 additions & 76 deletions llvm/test/CodeGen/AMDGPU/fmax3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -67,32 +67,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmax3_olt_0_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_0_f32:
Expand Down Expand Up @@ -199,32 +199,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmax3_olt_1_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_1_f32:
Expand Down Expand Up @@ -338,32 +338,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmax3_olt_0_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_0_f16:
Expand Down Expand Up @@ -478,32 +478,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmax3_olt_1_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1
; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_1_f16:
Expand Down
564 changes: 282 additions & 282 deletions llvm/test/CodeGen/AMDGPU/fmed3.ll

Large diffs are not rendered by default.

228 changes: 114 additions & 114 deletions llvm/test/CodeGen/AMDGPU/fmin3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -67,32 +67,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmin3_olt_0_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2
; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f32:
Expand Down Expand Up @@ -199,32 +199,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmin3_olt_1_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f32 v0, v2, v0, v1
; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f32:
Expand Down Expand Up @@ -338,32 +338,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmin3_olt_0_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2
; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f16:
Expand Down Expand Up @@ -478,32 +478,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmin3_olt_1_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f16 v0, v2, v0, v1
; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f16:
Expand Down Expand Up @@ -680,36 +680,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmin3_olt_0_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f64:
Expand Down Expand Up @@ -827,36 +827,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: test_fmin3_olt_1_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s14, s10
; GFX9-NEXT: s_mov_b32 s15, s11
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s2
; GFX9-NEXT: s_mov_b32 s13, s3
; GFX9-NEXT: s_mov_b32 s16, s4
; GFX9-NEXT: s_mov_b32 s17, s5
; GFX9-NEXT: s_mov_b32 s18, s10
; GFX9-NEXT: s_mov_b32 s19, s11
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s16, s8
; GFX9-NEXT: s_mov_b32 s17, s9
; GFX9-NEXT: s_mov_b32 s18, s2
; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
; GFX9-NEXT: s_mov_b32 s8, s10
; GFX9-NEXT: s_mov_b32 s9, s11
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f64:
Expand Down
180 changes: 108 additions & 72 deletions llvm/test/CodeGen/AMDGPU/fmul.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -111,23 +111,41 @@ define amdgpu_kernel void @fmul_f16_imm_a(
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; GFX89-LABEL: fmul_f16_imm_a:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
; VI-LABEL: fmul_f16_imm_a:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_f16_imm_a:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
Expand Down Expand Up @@ -178,23 +196,41 @@ define amdgpu_kernel void @fmul_f16_imm_b(
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; GFX89-LABEL: fmul_f16_imm_b:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: s_mov_b32 s10, s6
; GFX89-NEXT: s_mov_b32 s11, s7
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s8, s2
; GFX89-NEXT: s_mov_b32 s9, s3
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_mov_b32 s4, s0
; GFX89-NEXT: s_mov_b32 s5, s1
; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
; VI-LABEL: fmul_f16_imm_b:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_f16_imm_b:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0
; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
Expand Down Expand Up @@ -390,21 +426,21 @@ define amdgpu_kernel void @fmul_v2f16_imm_a(
;
; GFX9-LABEL: fmul_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s10, s6
; GFX9-NEXT: s_mov_b32 s11, s7
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
; GFX9-NEXT: s_mov_b32 s9, s3
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s0, 0x44004200
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0x44004200
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_a:
Expand Down Expand Up @@ -485,21 +521,21 @@ define amdgpu_kernel void @fmul_v2f16_imm_b(
;
; GFX9-LABEL: fmul_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s10, s6
; GFX9-NEXT: s_mov_b32 s11, s7
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
; GFX9-NEXT: s_mov_b32 s9, s3
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s0, 0x42004400
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0x42004400
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_b:
Expand Down Expand Up @@ -725,23 +761,23 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
;
; GFX9-LABEL: fmul_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s10, s6
; GFX9-NEXT: s_mov_b32 s11, s7
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s2
; GFX9-NEXT: s_mov_b32 s9, s3
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s2, 0x44004200
; GFX9-NEXT: s_mov_b32 s3, 0x40004800
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_mov_b32 s6, 0x44004200
; GFX9-NEXT: s_mov_b32 s7, 0x40004800
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2
; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: v_pk_mul_f16 v1, v1, s6
; GFX9-NEXT: v_pk_mul_f16 v0, v0, s7
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v4f16_imm_a:
Expand Down
70 changes: 35 additions & 35 deletions llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -62,32 +62,32 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-FLUSH-LABEL: fmuladd_f16:
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_f16:
; GFX10-DENORM: ; %bb.0:
; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x2
; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5]
; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1]
; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_f16:
Expand Down Expand Up @@ -176,48 +176,48 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-FLUSH-LABEL: fmul_fadd_f16:
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-STRICT-NEXT: s_clause 0x2
; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[4:5]
; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[8:9]
; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2
; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[4:5]
; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[8:9]
; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmul_fadd_f16:
Expand Down Expand Up @@ -326,32 +326,32 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
;
; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
; GFX10-DENORM: ; %bb.0:
; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x2
; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5]
; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7]
; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1]
; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
Expand Down
Loading