diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll deleted file mode 100644 index bcfbb734f8e04..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ /dev/null @@ -1,1108 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s -; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s - -define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm -; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7 -; VI-NEXT: v_med3_f32 v2, v4, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid - %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid - %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid - %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %a = load volatile float, ptr addrspace(1) %gep0 - %b = load volatile float, ptr addrspace(1) %gep1 - %c = load volatile float, ptr addrspace(1) %gep2 - %a.fneg = fsub float -0.0, %a - %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) - store float %med3, ptr addrspace(1) %outgep - ret void -} - -define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm -; -; VI-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7 -; VI-NEXT: v_med3_f32 v2, v4, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid - %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid - %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid - %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %a = load volatile float, ptr addrspace(1) %gep0 - %b = load volatile float, ptr addrspace(1) %gep1 - %c = load volatile float, ptr addrspace(1) %gep2 - %a.fneg = fsub float -0.0, %a - %tmp0 = call float @llvm.maxnum.f32(float %a.fneg, float %b) - %tmp1 = call float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2) - store float %med3, ptr addrspace(1) %outgep - ret void -} - -define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_min_f32_e32 v5, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_min_f32_e32 v2, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v5, v2 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm -; -; VI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: v_max_f32_e32 v2, v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 -; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid - %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid - %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid - %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %a = load volatile float, ptr addrspace(1) %gep0 - %b = load volatile float, ptr addrspace(1) %gep1 - %c = load volatile float, ptr addrspace(1) %gep2 - %a.fneg = fsub float -0.0, %a - %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) - store float %med3, ptr addrspace(1) %outgep - ret void -} - -define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm -; -; VI-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7 -; VI-NEXT: v_med3_f32 v2, v4, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid - %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid - %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid - %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %a = load volatile float, ptr addrspace(1) %gep0 - %b = load volatile float, ptr addrspace(1) %gep1 - %c = load volatile float, ptr addrspace(1) %gep2 - %a.fneg = fsub float -0.0, %a - %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b) - %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b) - %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) - store float %med3, ptr addrspace(1) %outgep - ret void -} - -define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; SI-NEXT: v_mul_f32_e64 v4, -1.0, |v4| -; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm -; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, -1.0, v7 -; VI-NEXT: v_mul_f32_e64 v3, -1.0, |v3| -; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX9-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX10-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 -; GFX11-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, |v2|, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid - %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid - %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid - %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %a = load volatile float, ptr addrspace(1) %gep0 - %b = load volatile float, ptr addrspace(1) %gep1 - %c = load volatile float, ptr addrspace(1) %gep2 - - %a.fneg = fsub float -0.0, %a - %b.fabs = call float @llvm.fabs.f32(float %b) - %c.fabs = call float @llvm.fabs.f32(float %c) - %c.fabs.fneg = fsub float -0.0, %c.fabs - - %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs) - %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) - - store float %med3, ptr addrspace(1) %outgep - ret void -} - -define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e64 v2, -1.0, |v2| -; SI-NEXT: v_mul_f32_e64 v3, -1.0, |v3| -; SI-NEXT: v_mul_f32_e64 v4, -1.0, |v4| -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm -; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e64 v4, -1.0, |v7| -; VI-NEXT: v_mul_f32_e64 v2, -1.0, |v2| -; VI-NEXT: v_mul_f32_e64 v3, -1.0, |v3| -; VI-NEXT: v_med3_f32 v2, v4, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| -; GFX9-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| -; GFX9-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| -; GFX10-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| -; GFX10-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| -; GFX11-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| -; GFX11-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid - %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid - %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid - %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %a = load volatile float, ptr addrspace(1) %gep0 - %b = load volatile float, ptr addrspace(1) %gep1 - %c = load volatile float, ptr addrspace(1) %gep2 - - %a.fabs = call float @llvm.fabs.f32(float %a) - %a.fabs.fneg = fsub float -0.0, %a.fabs - %b.fabs = call float @llvm.fabs.f32(float %b) - %b.fabs.fneg = fsub float -0.0, %b.fabs - %c.fabs = call float @llvm.fabs.f32(float %c) - %c.fabs.fneg = fsub float -0.0, %c.fabs - - %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) - %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) - - store float %med3, ptr addrspace(1) %outgep - ret void -} - -define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_nnan_inputs_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm -; -; VI-LABEL: v_nnan_inputs_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 -; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 -; VI-NEXT: v_med3_f32 v2, v4, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 -; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: v_nnan_inputs_med3_f32_pat0: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX10-NEXT: v_add_f32_e32 v2, 2.0, v2 -; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 -; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid - %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid - %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid - %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %a = load volatile float, ptr addrspace(1) %gep0 - %b = load volatile float, ptr addrspace(1) %gep1 - %c = load volatile float, ptr addrspace(1) %gep2 - - %a.nnan = fadd nnan float %a, 1.0 - %b.nnan = fadd nnan float %b, 2.0 - %c.nnan = fadd nnan float %c, 4.0 - - %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan) - %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) - store float %med3, ptr addrspace(1) %outgep - ret void -} - - -; --------------------------------------------------------------------- -; Negative patterns -; --------------------------------------------------------------------- - -define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[4:5] -; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_min_f32_e32 v5, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_f32_e32 v2, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v5, v2 -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm -; -; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: v_max_f32_e32 v2, v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v5 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v[0:1], v4, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: v_test_safe_med3_f32_pat0_multi_use0: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX10-NEXT: v_max_f32_e32 v4, v1, v2 -; GFX10-NEXT: v_min_f32_e32 v1, v1, v2 -; GFX10-NEXT: v_min_f32_e32 v2, v4, v3 -; GFX10-NEXT: v_max_f32_e32 v2, v1, v2 -; GFX10-NEXT: global_store_dword v[0:1], v1, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[0:1] -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v4, v1, v2 -; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 -; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4 -; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid - %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid - %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid - %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid - %a = load volatile float, ptr addrspace(1) %gep0 - %b = load volatile float, ptr addrspace(1) %gep1 - %c = load volatile float, ptr addrspace(1) %gep2 - %tmp0 = call float @llvm.minnum.f32(float %a, float %b) - store volatile float %tmp0, ptr addrspace(1) undef - %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) - %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) - %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) - store float %med3, ptr addrspace(1) %outgep - ret void -} - -declare i32 @llvm.amdgcn.workitem.id.x() #0 -declare float @llvm.fabs.f32(float) #0 -declare float @llvm.minnum.f32(float, float) #0 -declare float @llvm.maxnum.f32(float, float) #0 -declare double @llvm.minnum.f64(double, double) #0 -declare double @llvm.maxnum.f64(double, double) #0 -declare half @llvm.fabs.f16(half) #0 -declare half @llvm.minnum.f16(half, half) #0 -declare half @llvm.maxnum.f16(half, half) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } -attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 49ecccd1135ca..40ebb191802a8 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -1,8 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -march=amdgcn -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; TODO: LLVM ERROR: cannot select: G_STORE %30:vgpr(s64), %22:vgpr(p1) +; RUN: not --crash llc -march=amdgcn -global-isel=1 -verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s +; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-SDAG %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL %s define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { ; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: @@ -22,23 +27,43 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 -; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; GFX9: ; %bb.0: @@ -96,23 +121,43 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_fmed3_nnan_r_i_i_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 -; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; GFX9: ; %bb.0: @@ -171,23 +216,43 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 -; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; GFX9: ; %bb.0: @@ -246,23 +311,43 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 -; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; GFX9: ; %bb.0: @@ -322,24 +407,45 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 -; VI-NEXT: v_max_f32_e32 v2, 4.0, v2 -; VI-NEXT: v_min_f32_e32 v2, 2.0, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-SDAG-NEXT: v_max_f32_e32 v2, 4.0, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v2, 2.0, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, 4.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, 2.0, v2 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; GFX9: ; %bb.0: @@ -403,62 +509,121 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 -; VI-NEXT: v_max_f32_e32 v2, 2.0, v2 -; VI-NEXT: v_min_f32_e32 v3, 4.0, v2 -; VI-NEXT: flat_store_dword v[0:1], v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_max_f32_e32 v1, 2.0, v1 -; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v1 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v1, 2.0, v1 -; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v1 -; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-SDAG-NEXT: v_max_f32_e32 v2, 2.0, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v3, 4.0, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v3 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v3, 2.0, v2 +; VI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_store_dword v[0:1], v3 +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1 +; GFX9-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v1 +; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1 +; GFX11-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_f32 v2, v1, 2.0, 4.0 +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -492,24 +657,45 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_fmed3_r_i_i_f64: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 -; VI-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; VI-SDAG-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; VI-GISEL-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_fmed3_r_i_i_f64: ; GFX9: ; %bb.0: @@ -569,22 +755,41 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_fmed3_r_i_i_no_nans_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_med3_f32 v2, v3, 2.0, 4.0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_med3_f32 v2, v3, 2.0, 4.0 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_med3_f32 v2, v3, 2.0, 4.0 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; GFX9: ; %bb.0: @@ -639,50 +844,106 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_legacy_fmed3_r_i_i_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 -; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_legacy_fmed3_r_i_i_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_legacy_fmed3_r_i_i_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 2.0, vcc +; VI-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc, 4.0, v2 +; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 4.0, vcc +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc +; GFX9-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc, 4.0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 2.0, v1 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc_lo +; GFX11-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -726,64 +987,131 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, -v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, -v1, v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, -v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7 +; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -826,64 +1154,131 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, -v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, -v2, v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, -v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, -v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2 +; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -926,64 +1321,131 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, -v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, v1, v2, -v3 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, v1, v2, -v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, -v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v3, -1.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3 +; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -1026,64 +1488,134 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, -v7, |v2|, -|v3| -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, -v7, |v2|, -|v3| +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e64 v3, -1.0, |v3| +; VI-GISEL-NEXT: v_med3_f32 v2, v4, |v2|, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| +; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -1132,64 +1664,137 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, -|v7|, -|v2|, -|v3| -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, -|v7|, -|v2|, -|v3| +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v7| +; VI-GISEL-NEXT: v_mul_f32_e64 v2, -1.0, |v2| +; VI-GISEL-NEXT: v_mul_f32_e64 v3, -1.0, |v3| +; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| +; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| +; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2| +; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3| +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -1243,35 +1848,69 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_nnan_inputs_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 -; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 -; VI-NEXT: v_med3_f32 v2, v4, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: ; GFX9: ; %bb.0: @@ -1356,32 +1995,63 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_nnan_input_calls_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0: ; GFX9: ; %bb.0: @@ -1455,32 +2125,63 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_nnan_call_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_nnan_call_med3_f32_pat0: ; GFX9: ; %bb.0: @@ -1554,32 +2255,63 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_fast_call_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_fast_call_med3_f32_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_fast_call_med3_f32_pat0: ; GFX9: ; %bb.0: @@ -1665,32 +2397,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0: ; GFX9: ; %bb.0: @@ -1764,32 +2527,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat1: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1: ; GFX9: ; %bb.0: @@ -1838,6 +2632,173 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o ret void } +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, -v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7 +; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid + %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid + %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid + %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid + %a = load volatile float, ptr addrspace(1) %gep0 + %b = load volatile float, ptr addrspace(1) %gep1 + %c = load volatile float, ptr addrspace(1) %gep2 + %a.fneg = fsub float -0.0, %a + %tmp0 = call float @llvm.maxnum.f32(float %a.fneg, float %b) + %tmp1 = call float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp2 = call float @llvm.maxnum.f32(float %tmp1, float %c) + %med3 = call float @llvm.minnum.f32(float %tmp0, float %tmp2) + store float %med3, ptr addrspace(1) %outgep + ret void +} + define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-LABEL: v_test_global_nnans_med3_f32_pat2: ; SI: ; %bb.0: @@ -1863,32 +2824,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat2: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2: ; GFX9: ; %bb.0: @@ -1962,32 +2954,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat3: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3: ; GFX9: ; %bb.0: @@ -2061,32 +3084,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat4: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v2, v7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4: ; GFX9: ; %bb.0: @@ -2160,32 +3214,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat5: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v2, v7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5: ; GFX9: ; %bb.0: @@ -2259,32 +3344,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat6: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v2, v7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6: ; GFX9: ; %bb.0: @@ -2358,32 +3474,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat7: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v2, v7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7: ; GFX9: ; %bb.0: @@ -2457,32 +3604,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat8: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8: ; GFX9: ; %bb.0: @@ -2556,32 +3734,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat9: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v2, v7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9: ; GFX9: ; %bb.0: @@ -2655,32 +3864,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat10: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10: ; GFX9: ; %bb.0: @@ -2754,32 +3994,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat11: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v2, v7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11: ; GFX9: ; %bb.0: @@ -2853,32 +4124,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat12: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v2, v7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12: ; GFX9: ; %bb.0: @@ -2952,32 +4254,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat13: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v2, v7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13: ; GFX9: ; %bb.0: @@ -3051,32 +4384,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat14: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14: ; GFX9: ; %bb.0: @@ -3150,32 +4514,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat15: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v2, v7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15: ; GFX9: ; %bb.0: @@ -3252,32 +4647,63 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_med3_f32 v2, v7, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16: ; GFX9: ; %bb.0: @@ -3364,40 +4790,79 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: v_max_f32_e32 v2, v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v5 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-SDAG-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v5 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v5 +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; GFX9: ; %bb.0: @@ -3495,40 +4960,79 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use1: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: v_max_f32_e32 v2, v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-SDAG-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; GFX9: ; %bb.0: @@ -3553,29 +5057,52 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use1: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v1, v2 -; GFX11-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v1, v2 +; GFX11-SDAG-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v2, v1, v2, v4 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -3627,40 +5154,79 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use2: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f32_e32 v2, v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-SDAG-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; GFX9: ; %bb.0: @@ -3754,38 +5320,75 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_safe_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: flat_load_dword v6, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; VI-NEXT: v_min_f32_e32 v5, v4, v2 -; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: v_max_f32_e32 v2, v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_safe_med3_f32_pat0: ; GFX9: ; %bb.0: @@ -3808,26 +5411,47 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_safe_med3_f32_pat0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 -; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-SDAG-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -3872,35 +5496,69 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 -; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 -; VI-NEXT: v_med3_f32 v2, v4, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; GFX9: ; %bb.0: @@ -3988,35 +5646,69 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 -; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 -; VI-NEXT: v_med3_f32 v2, v4, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; GFX9: ; %bb.0: @@ -4104,35 +5796,69 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 -; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 -; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 -; VI-NEXT: v_med3_f32 v2, v4, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-SDAG-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; GFX9: ; %bb.0: @@ -4192,6 +5918,173 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) ret void } +define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, -v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7 +; VI-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid + %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid + %gep2 = getelementptr float, ptr addrspace(1) %cptr, i32 %tid + %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid + %a = load volatile float, ptr addrspace(1) %gep0 + %b = load volatile float, ptr addrspace(1) %gep1 + %c = load volatile float, ptr addrspace(1) %gep2 + %a.fneg = fsub float -0.0, %a + %tmp0 = call nnan float @llvm.minnum.f32(float %a.fneg, float %b) + %tmp1 = call nnan float @llvm.maxnum.f32(float %a.fneg, float %b) + %tmp2 = call nnan float @llvm.minnum.f32(float %tmp1, float %c) + %med3 = call nnan float @llvm.maxnum.f32(float %tmp0, float %tmp2) + store float %med3, ptr addrspace(1) %outgep + ret void +} + define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { ; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; SI: ; %bb.0: @@ -4219,72 +6112,147 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: flat_load_dword v6, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_min_f32_e64 v4, -v6, v2 -; VI-NEXT: v_max_f32_e32 v2, v6, v2 -; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_min_f32_e64 v4, -v1, v2 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_minmax_f32 v1, -v1, v2, v3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_min_f32_e64 v4, -v6, v2 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v6, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7 +; VI-GISEL-NEXT: v_max_f32_e32 v5, v7, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v3, v5, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_min_f32_e64 v4, -v1, v2 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v4, v2 +; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_minmax_f32 v1, -v1, v2, v3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1 +; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -4329,33 +6297,65 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_global_nnans_min_max_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_dword v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_dword v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_max_f32_e32 v2, v7, v2 -; VI-NEXT: v_min_f32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_max_f32_e32 v2, v7, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_global_nnans_min_max_f32: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_max_f32_e32 v2, v7, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_global_nnans_min_max_f32: ; GFX9: ; %bb.0: @@ -4423,24 +6423,45 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_e32 v2, 1.0, v3 -; VI-NEXT: v_max_f16_e32 v2, 2.0, v2 -; VI-NEXT: v_min_f16_e32 v2, 4.0, v2 -; VI-NEXT: flat_store_short v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1] +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f16_e32 v2, 1.0, v3 +; VI-SDAG-NEXT: v_max_f16_e32 v2, 2.0, v2 +; VI-SDAG-NEXT: v_min_f16_e32 v2, 4.0, v2 +; VI-SDAG-NEXT: flat_store_short v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f16_e32 v2, 1.0, v3 +; VI-GISEL-NEXT: v_max_f16_e32 v2, 2.0, v2 +; VI-GISEL-NEXT: v_min_f16_e32 v2, 4.0, v2 +; VI-GISEL-NEXT: flat_store_short v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; GFX9: ; %bb.0: @@ -4512,38 +6533,75 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt ; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: v_nnan_inputs_med3_f16_pat0: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v6, 1, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_ushort v7, v[0:1] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_ushort v2, v[2:3] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_load_ushort v3, v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_f16_e32 v4, 1.0, v7 -; VI-NEXT: v_add_f16_e32 v2, 2.0, v2 -; VI-NEXT: v_add_f16_e32 v3, 4.0, v3 -; VI-NEXT: v_min_f16_e32 v5, v4, v2 -; VI-NEXT: v_max_f16_e32 v2, v4, v2 -; VI-NEXT: v_min_f16_e32 v2, v2, v3 -; VI-NEXT: v_max_f16_e32 v2, v5, v2 -; VI-NEXT: flat_store_short v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 +; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7 +; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-SDAG-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: v_add_f16_e32 v4, 1.0, v7 +; VI-SDAG-NEXT: v_add_f16_e32 v2, 2.0, v2 +; VI-SDAG-NEXT: v_add_f16_e32 v3, 4.0, v3 +; VI-SDAG-NEXT: v_min_f16_e32 v5, v4, v2 +; VI-SDAG-NEXT: v_max_f16_e32 v2, v4, v2 +; VI-SDAG-NEXT: v_min_f16_e32 v2, v2, v3 +; VI-SDAG-NEXT: v_max_f16_e32 v2, v5, v2 +; VI-SDAG-NEXT: flat_store_short v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7 +; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-GISEL-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: v_add_f16_e32 v4, 1.0, v7 +; VI-GISEL-NEXT: v_add_f16_e32 v2, 2.0, v2 +; VI-GISEL-NEXT: v_add_f16_e32 v3, 4.0, v3 +; VI-GISEL-NEXT: v_min_f16_e32 v5, v4, v2 +; VI-GISEL-NEXT: v_max_f16_e32 v2, v4, v2 +; VI-GISEL-NEXT: v_min_f16_e32 v2, v2, v3 +; VI-GISEL-NEXT: v_max_f16_e32 v2, v5, v2 +; VI-GISEL-NEXT: flat_store_short v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0: ; GFX9: ; %bb.0: @@ -4623,24 +6681,45 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; -; VI-LABEL: two_non_inline_constant: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 0.5, v3 -; VI-NEXT: v_max_f32_e32 v2, 0x41000000, v2 -; VI-NEXT: v_min_f32_e32 v2, 0x41800000, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: two_non_inline_constant: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-SDAG-NEXT: v_max_f32_e32 v2, 0x41000000, v2 +; VI-SDAG-NEXT: v_min_f32_e32 v2, 0x41800000, v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: two_non_inline_constant: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-GISEL-NEXT: v_max_f32_e32 v2, 0x41000000, v2 +; VI-GISEL-NEXT: v_min_f32_e32 v2, 0x41800000, v2 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: two_non_inline_constant: ; GFX9: ; %bb.0: @@ -4655,21 +6734,37 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: two_non_inline_constant: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_mov_b32 s2, 0x41000000 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_maxmin_f32 v1, v1, s2, 0x41800000 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: two_non_inline_constant: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_mov_b32 s2, 0x41000000 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, s2, 0x41800000 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: two_non_inline_constant: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, 0x41000000, v2 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -4706,43 +6801,83 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: one_non_inline_constant: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x41800000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 0.5, v3 -; VI-NEXT: v_med3_f32 v2, v2, 1.0, v4 -; VI-NEXT: v_add_f32_e32 v3, 0x41800000, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: flat_store_dword v[0:1], v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: one_non_inline_constant: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1 -; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1 -; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] -; GFX9-NEXT: global_store_dword v[0:1], v1, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; VI-SDAG-LABEL: one_non_inline_constant: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v2, 1.0, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v3, 0x41800000, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v3 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: one_non_inline_constant: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_mov_b32 s2, 0x41800000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v2, 1.0, s2 +; VI-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v3 +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: one_non_inline_constant: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 0.5, v1 +; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 0x41800000, v1 +; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, 1.0, v2 +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-SDAG-NEXT: global_store_dword v[0:1], v1, off +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: one_non_inline_constant: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_mov_b32 s2, 0x41800000 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_f32_e32 v2, 0.5, v1 +; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 0x41800000, v1 +; GFX9-GISEL-NEXT: v_med3_f32 v2, v2, 1.0, s2 +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v[0:1], v1, off +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: one_non_inline_constant: ; GFX11: ; %bb.0: @@ -4803,31 +6938,59 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; -; VI-LABEL: two_non_inline_constant_multi_use: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x41800000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_mov_b32 s2, 0x41000000 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f32_e32 v2, 0.5, v3 -; VI-NEXT: v_med3_f32 v2, v2, s2, v4 -; VI-NEXT: v_add_f32_e32 v5, 0x41800000, v3 -; VI-NEXT: v_add_f32_e32 v3, 0x41000000, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: flat_store_dword v[0:1], v5 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_endpgm +; VI-SDAG-LABEL: two_non_inline_constant_multi_use: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s2, 0x41000000 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-SDAG-NEXT: v_med3_f32 v2, v2, s2, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v5, 0x41800000, v3 +; VI-SDAG-NEXT: v_add_f32_e32 v3, 0x41000000, v3 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v2 +; VI-SDAG-NEXT: flat_store_dword v[0:1], v5 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: flat_store_dword v[0:1], v3 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: two_non_inline_constant_multi_use: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41800000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: s_mov_b32 s2, 0x41000000 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-GISEL-NEXT: v_med3_f32 v2, v2, s2, v4 +; VI-GISEL-NEXT: v_add_f32_e32 v5, 0x41800000, v3 +; VI-GISEL-NEXT: v_add_f32_e32 v3, 0x41000000, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v5 +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: flat_store_dword v[0:1], v3 +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: s_endpgm ; ; GFX9-LABEL: two_non_inline_constant_multi_use: ; GFX9: ; %bb.0: @@ -4849,27 +7012,49 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: two_non_inline_constant_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_mov_b32 s2, 0x41000000 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v3, 0x41800000, v1 -; GFX11-NEXT: v_add_f32_e32 v2, 0.5, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 0x41000000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_med3_f32 v2, v2, s2, 0x41800000 -; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-SDAG-LABEL: two_non_inline_constant_multi_use: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_mov_b32 s2, 0x41000000 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 0x41800000, v1 +; GFX11-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v1 +; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_med3_f32 v2, v2, s2, 0x41800000 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_nop 0 +; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_mov_b32 s2, 0x41800000 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v1 +; GFX11-GISEL-NEXT: v_add_f32_e32 v2, 0.5, v1 +; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_med3_f32 v2, v2, 0x41000000, s2 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_nop 0 +; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid