diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index cec35d1147bb0..4297111930de2 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -574,11 +574,11 @@ class MUBUF_Store_Pseudo { - def _OFFSET : GCNPat < + def : GCNPat < (st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)), (!cast(BaseInst # _OFFSET) store_vt:$vdata, v4i32:$srsrc, i32:$soffset, i32:$offset)>; - def _ADDR64 : GCNPat < + def : GCNPat < (st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset)), (!cast(BaseInst # _ADDR64) store_vt:$vdata, i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset)>; } @@ -912,10 +912,22 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>; defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>; -defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>; + +foreach vt = Reg32Types.types in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", vt, load_global>; +} + +foreach vt = VReg_64.RegTypes in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", vt, load_global>; +} + +foreach vt = VReg_96.RegTypes in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", vt, load_global>; +} + +foreach vt = VReg_128.RegTypes in { +defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>; +} defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < "buffer_store_byte", i32 @@ -938,10 +950,22 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_BYTE", i32, truncstorei8_global>; defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_SHORT", i32, truncstorei16_global>; -defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", i32, store_global>; -defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", v2i32, store_global>; -defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", v3i32, store_global>; -defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", v4i32, store_global>; + +foreach vt = Reg32Types.types in { +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", vt, store_global>; +} + +foreach vt = VReg_64.RegTypes in { +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", vt, store_global>; +} + +foreach vt = VReg_96.RegTypes in { +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", vt, store_global>; +} + +foreach vt = VReg_128.RegTypes in { +defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>; +} defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics < "buffer_atomic_swap", VGPR_32, i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll index 2263672adb945..7028d1157787f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -77,7 +77,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr, ; ; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx: ; GFX7: ; %bb.0: -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_lshr_b32 s0, s2, 1 ; GFX7-NEXT: s_and_b32 s1, s2, 1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 @@ -150,7 +153,10 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) ; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 @@ -360,7 +366,10 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) { ; GFX7-LABEL: extractelement_vgpr_v4i16_idx0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -402,7 +411,10 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) { ; GFX7-LABEL: extractelement_vgpr_v4i16_idx1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -447,7 +459,10 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) { ; GFX7-LABEL: extractelement_vgpr_v4i16_idx2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -492,7 +507,10 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) { ; GFX7-LABEL: extractelement_vgpr_v4i16_idx3: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index 5b3cba2000de6..0417b97a53c0f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -318,6 +318,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -334,9 +336,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3] ; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] ; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3] -; CI-NEXT: v_mov_b32_e32 v2, s4 -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_f64: @@ -381,6 +381,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1] ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 @@ -394,9 +396,7 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) ; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] ; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3] -; CI-NEXT: v_mov_b32_e32 v2, s4 -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: fast_frem_f64: @@ -438,6 +438,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1] ; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0 @@ -451,9 +453,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5] ; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] ; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3] -; CI-NEXT: v_mov_b32_e32 v2, s4 -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: unsafe_frem_f64: @@ -532,15 +532,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v1, -v3, v2, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_or_b32_e32 v2, v0, v1 -; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f16: @@ -669,15 +669,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3 ; CI-NEXT: v_trunc_f32_e32 v5, v5 ; CI-NEXT: v_fma_f32 v3, -v5, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; CI-NEXT: v_or_b32_e32 v1, v2, v1 -; CI-NEXT: v_mov_b32_e32 v2, s4 -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v4f16: @@ -1017,6 +1017,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1] ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -1043,9 +1045,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3] ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3] -; CI-NEXT: v_mov_b32_e32 v4, s4 -; CI-NEXT: v_mov_b32_e32 v5, s5 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index d2b2aef8077f9..d6957be8ab8ff 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -50,11 +50,12 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_or_b32 s2, s0, s2 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v2i16_s_s: @@ -135,19 +136,21 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(ptr addrspace(1) %ptr, i16 inre ; ; GFX7-LABEL: insertelement_v_v2i16_s_s: ; GFX7: ; %bb.0: -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_and_b32 s1, s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 ; GFX7-NEXT: s_lshl_b32 s1, s1, s0 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, s1, v2 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_s_s: @@ -228,14 +231,15 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: s_and_b32 s1, s4, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, s0, v2 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v2i16_v_s: @@ -318,15 +322,16 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX7-NEXT: s_and_b32 s1, s4, 0xffff -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v2i16_s_v: @@ -410,15 +415,16 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v1 -; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v2i16_v_v: @@ -499,19 +505,21 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre ; ; GFX7-LABEL: insertelement_v_v2i16_s_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_s_v: @@ -590,19 +598,21 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(ptr addrspace(1) %ptr, i16 %val ; ; GFX7-LABEL: insertelement_v_v2i16_v_s: ; GFX7: ; %bb.0: -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX7-NEXT: s_lshl_b32 s0, s0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_v_s: @@ -681,19 +691,21 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val ; ; GFX7-LABEL: insertelement_v_v2i16_v_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v2i16_v_v: @@ -842,7 +854,10 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre ; ; GFX7-LABEL: insertelement_v_v4i16_s_s: ; GFX7: ; %bb.0: -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_and_b32 s1, s3, 1 ; GFX7-NEXT: s_lshr_b32 s0, s3, 1 ; GFX7-NEXT: s_and_b32 s2, s2, 0xffff @@ -851,16 +866,15 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(ptr addrspace(1) %ptr, i16 inre ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_not_b32 s1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v4, s1, v4 -; GFX7-NEXT: v_or_b32_e32 v4, s2, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_s_s: @@ -980,15 +994,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v4, s3, v0 +; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v4i16_v_s: @@ -1119,15 +1134,16 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_not_b32_e32 v0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v4, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v4i16_s_v: @@ -1258,15 +1274,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX7-NEXT: v_not_b32_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_s_v4i16_v_v: @@ -1376,25 +1393,27 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre ; ; GFX7-LABEL: insertelement_v_v4i16_s_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 1, v2 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX7-NEXT: v_lshl_b32_e32 v6, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_not_b32_e32 v2, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_s_v: @@ -1498,7 +1517,10 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val ; ; GFX7-LABEL: insertelement_v_v4i16_v_s: ; GFX7: ; %bb.0: -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_and_b32 s1, s2, 1 ; GFX7-NEXT: s_lshr_b32 s0, s2, 1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -1507,16 +1529,15 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(ptr addrspace(1) %ptr, i16 %val ; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_not_b32 s1, s1 -; GFX7-NEXT: v_mov_b32_e32 v3, 0 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v5, s1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_v_s: @@ -1619,25 +1640,27 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val ; ; GFX7-LABEL: insertelement_v_v4i16_v_v: ; GFX7: ; %bb.0: -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 1, v3 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_not_b32_e32 v3, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GFX7-NEXT: v_mov_b32_e32 v5, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: insertelement_v_v4i16_v_v: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir index ea5a0c5aceb5d..f26e23293dae7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir @@ -31,6 +31,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_4 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -42,30 +43,35 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_4 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX8-LABEL: name: load_global_s32_from_4 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_global_s32_from_4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_global_s32_from_4 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; ; GFX11-LABEL: name: load_global_s32_from_4 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -100,6 +106,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_2 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -111,30 +118,35 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_2 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] + ; ; GFX8-LABEL: name: load_global_s32_from_2 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] + ; ; GFX9-LABEL: name: load_global_s32_from_2 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] + ; ; GFX10-LABEL: name: load_global_s32_from_2 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] + ; ; GFX11-LABEL: name: load_global_s32_from_2 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -169,6 +181,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -180,30 +193,35 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -238,6 +256,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_v2s32 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -249,30 +268,35 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_v2s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX8-LABEL: name: load_global_v2s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_global_v2s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_global_v2s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX11-LABEL: name: load_global_v2s32 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -307,6 +331,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_v4s32 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -318,30 +343,35 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_v4s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX8-LABEL: name: load_global_v4s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX9-LABEL: name: load_global_v4s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; ; GFX10-LABEL: name: load_global_v4s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; ; GFX11-LABEL: name: load_global_v4s32 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -368,39 +398,55 @@ body: | ; GFX6-LABEL: name: load_global_s64 ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s64 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX8-LABEL: name: load_global_s64 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_global_s64 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_global_s64 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX11-LABEL: name: load_global_s64 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -427,39 +473,55 @@ body: | ; GFX6-LABEL: name: load_global_v2s64 ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) + ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_v2s64 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) + ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_v2s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX8-LABEL: name: load_global_v2s64 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX9-LABEL: name: load_global_v2s64 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; ; GFX10-LABEL: name: load_global_v2s64 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; ; GFX11-LABEL: name: load_global_v2s64 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -489,36 +551,42 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX7-LABEL: name: load_global_v2p1 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX7-FLAT-LABEL: name: load_global_v2p1 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX8-LABEL: name: load_global_v2p1 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX9-LABEL: name: load_global_v2p1 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX10-LABEL: name: load_global_v2p1 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load (<2 x p1>), align 4, addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>) + ; ; GFX11-LABEL: name: load_global_v2p1 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -548,36 +616,42 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX7-LABEL: name: load_global_s128 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX7-FLAT-LABEL: name: load_global_s128 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX8-LABEL: name: load_global_s128 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX9-LABEL: name: load_global_s128 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX10-LABEL: name: load_global_s128 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; ; GFX11-LABEL: name: load_global_s128 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -604,39 +678,55 @@ body: | ; GFX6-LABEL: name: load_global_p3_from_4 ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load (p3), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_p3_from_4 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1) + ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_p3_from_4 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX8-LABEL: name: load_global_p3_from_4 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_global_p3_from_4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_global_p3_from_4 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; ; GFX11-LABEL: name: load_global_p3_from_4 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -663,39 +753,55 @@ body: | ; GFX6-LABEL: name: load_global_p1_from_8 ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load (p1), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1) + ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_p1_from_8 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_p1_from_8 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX8-LABEL: name: load_global_p1_from_8 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_global_p1_from_8 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_global_p1_from_8 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX11-LABEL: name: load_global_p1_from_8 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -725,36 +831,42 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX7-LABEL: name: load_global_p999_from_8 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX7-FLAT-LABEL: name: load_global_p999_from_8 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX8-LABEL: name: load_global_p999_from_8 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX9-LABEL: name: load_global_p999_from_8 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX10-LABEL: name: load_global_p999_from_8 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; ; GFX11-LABEL: name: load_global_p999_from_8 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -784,36 +896,42 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX7-LABEL: name: load_global_v2p3 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX7-FLAT-LABEL: name: load_global_v2p3 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX8-LABEL: name: load_global_v2p3 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX9-LABEL: name: load_global_v2p3 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX10-LABEL: name: load_global_v2p3 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; ; GFX11-LABEL: name: load_global_v2p3 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -840,39 +958,55 @@ body: | ; GFX6-LABEL: name: load_global_v2s16 ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load (<2 x s16>), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) + ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_v2s16 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) + ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_v2s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX8-LABEL: name: load_global_v2s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] + ; ; GFX9-LABEL: name: load_global_v2s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; ; GFX10-LABEL: name: load_global_v2s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] + ; ; GFX11-LABEL: name: load_global_v2s16 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -899,39 +1033,55 @@ body: | ; GFX6-LABEL: name: load_global_v4s16 ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load (<4 x s16>), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) + ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_v4s16 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_v4s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX8-LABEL: name: load_global_v4s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] + ; ; GFX9-LABEL: name: load_global_v4s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX10-LABEL: name: load_global_v4s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] + ; ; GFX11-LABEL: name: load_global_v4s16 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -958,39 +1108,55 @@ body: | ; GFX6-LABEL: name: load_global_v8s16 ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) + ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_v8s16 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) + ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_v8s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX8-LABEL: name: load_global_v8s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] + ; ; GFX9-LABEL: name: load_global_v8s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; ; GFX10-LABEL: name: load_global_v8s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; ; GFX11-LABEL: name: load_global_v8s16 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1029,6 +1195,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1040,6 +1207,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -1056,6 +1224,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1072,18 +1241,21 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1120,6 +1292,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1131,6 +1304,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -1147,6 +1321,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1163,12 +1338,14 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -1185,6 +1362,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1231,6 +1409,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1252,6 +1431,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -1268,6 +1448,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1284,18 +1465,21 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1342,6 +1526,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1363,6 +1548,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -1379,6 +1565,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1395,18 +1582,21 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1443,6 +1633,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1454,6 +1645,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -1470,6 +1662,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1486,12 +1679,14 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -1508,6 +1703,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1545,6 +1741,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1557,6 +1754,7 @@ body: | ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -1573,6 +1771,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1589,6 +1788,7 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -1605,6 +1805,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -1621,6 +1822,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1677,6 +1879,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1698,6 +1901,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -1714,6 +1918,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1730,12 +1935,14 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -1752,6 +1959,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1798,6 +2006,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1819,6 +2028,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -1835,6 +2045,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1851,12 +2062,14 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -1873,6 +2086,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -1910,6 +2124,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1922,6 +2137,7 @@ body: | ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -1938,6 +2154,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -1954,6 +2171,7 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -1970,6 +2188,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -1986,6 +2205,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -2033,6 +2253,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -2045,6 +2266,7 @@ body: | ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -2061,6 +2283,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -2077,6 +2300,7 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -2093,6 +2317,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -2109,6 +2334,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -2165,6 +2391,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -2186,6 +2413,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -2202,6 +2430,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -2218,6 +2447,7 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -2234,6 +2464,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -2250,6 +2481,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} @@ -2306,6 +2538,7 @@ body: | ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -2327,6 +2560,7 @@ body: | ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] + ; ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} @@ -2343,6 +2577,7 @@ body: | ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} @@ -2359,6 +2594,7 @@ body: | ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] + ; ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -2375,6 +2611,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -2391,6 +2628,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] + ; ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir index f36cb1d978214..c56ba70b667d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir @@ -29,6 +29,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; ; GFX7-LABEL: name: store_global_s32_to_4 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} @@ -40,24 +41,28 @@ body: | ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_s32_to_4 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; ; GFX8-LABEL: name: store_global_s32_to_4 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; ; GFX9-LABEL: name: store_global_s32_to_4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; ; GFX10-LABEL: name: store_global_s32_to_4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -91,6 +96,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 1) + ; ; GFX7-LABEL: name: store_global_s32_to_2 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} @@ -102,24 +108,28 @@ body: | ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s16), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_s32_to_2 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 1) + ; ; GFX8-LABEL: name: store_global_s32_to_2 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16), addrspace 1) + ; ; GFX9-LABEL: name: store_global_s32_to_2 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s16), addrspace 1) + ; ; GFX10-LABEL: name: store_global_s32_to_2 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -153,6 +163,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 1) + ; ; GFX7-LABEL: name: store_global_s32_to_1 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} @@ -164,24 +175,28 @@ body: | ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_s32_to_1 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 1) + ; ; GFX8-LABEL: name: store_global_s32_to_1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s8), addrspace 1) + ; ; GFX9-LABEL: name: store_global_s32_to_1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s8), addrspace 1) + ; ; GFX10-LABEL: name: store_global_s32_to_1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -208,33 +223,48 @@ body: | ; GFX6-LABEL: name: store_global_s64 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 - ; GFX6-NEXT: G_STORE [[COPY1]](s64), [[COPY]](p1) :: (store (s64), addrspace 1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; ; GFX7-LABEL: name: store_global_s64 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64), addrspace 1) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64), addrspace 1) + ; ; GFX8-LABEL: name: store_global_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64), addrspace 1) + ; ; GFX9-LABEL: name: store_global_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (s64), addrspace 1) + ; ; GFX10-LABEL: name: store_global_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -263,30 +293,35 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1) + ; ; GFX7-LABEL: name: store_global_s128 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_s128 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1) + ; ; GFX8-LABEL: name: store_global_s128 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1) + ; ; GFX9-LABEL: name: store_global_s128 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store (s128), addrspace 1) + ; ; GFX10-LABEL: name: store_global_s128 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -321,6 +356,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1) + ; ; GFX7-LABEL: name: store_global_v2s32 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} @@ -332,24 +368,28 @@ body: | ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_v2s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>), addrspace 1) + ; ; GFX8-LABEL: name: store_global_v2s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s32>), addrspace 1) + ; ; GFX9-LABEL: name: store_global_v2s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s32>), addrspace 1) + ; ; GFX10-LABEL: name: store_global_v2s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -383,6 +423,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1) + ; ; GFX7-LABEL: name: store_global_v4s32 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: {{ $}} @@ -394,24 +435,28 @@ body: | ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_v4s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>), addrspace 1) + ; ; GFX8-LABEL: name: store_global_v4s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s32>), addrspace 1) + ; ; GFX9-LABEL: name: store_global_v4s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s32>), addrspace 1) + ; ; GFX10-LABEL: name: store_global_v4s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -438,33 +483,48 @@ body: | ; GFX6-LABEL: name: store_global_v2s16 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 - ; GFX6-NEXT: G_STORE [[COPY1]](<2 x s16>), [[COPY]](p1) :: (store (<2 x s16>), addrspace 1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1) + ; ; GFX7-LABEL: name: store_global_v2s16 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 1) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_v2s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 1) + ; ; GFX8-LABEL: name: store_global_v2s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s16>), addrspace 1) + ; ; GFX9-LABEL: name: store_global_v2s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s16>), addrspace 1) + ; ; GFX10-LABEL: name: store_global_v2s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -491,33 +551,48 @@ body: | ; GFX6-LABEL: name: store_global_v4s16 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX6-NEXT: G_STORE [[COPY1]](<4 x s16>), [[COPY]](p1) :: (store (<4 x s16>), addrspace 1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1) + ; ; GFX7-LABEL: name: store_global_v4s16 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>), addrspace 1) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_v4s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>), addrspace 1) + ; ; GFX8-LABEL: name: store_global_v4s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>), addrspace 1) + ; ; GFX9-LABEL: name: store_global_v4s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<4 x s16>), addrspace 1) + ; ; GFX10-LABEL: name: store_global_v4s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -544,33 +619,48 @@ body: | ; GFX6-LABEL: name: store_global_v8s16 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX6-NEXT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store (<8 x s16>), addrspace 1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1) + ; ; GFX7-LABEL: name: store_global_v8s16 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_v8s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1) + ; ; GFX8-LABEL: name: store_global_v8s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<8 x s16>), addrspace 1) + ; ; GFX9-LABEL: name: store_global_v8s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<8 x s16>), addrspace 1) + ; ; GFX10-LABEL: name: store_global_v8s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -597,33 +687,48 @@ body: | ; GFX6-LABEL: name: store_global_v2s64 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX6-NEXT: G_STORE [[COPY1]](<2 x s64>), [[COPY]](p1) :: (store (<2 x s64>), addrspace 1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1) + ; ; GFX7-LABEL: name: store_global_v2s64 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX7-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>), addrspace 1) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_v2s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>), addrspace 1) + ; ; GFX8-LABEL: name: store_global_v2s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (<2 x s64>), addrspace 1) + ; ; GFX9-LABEL: name: store_global_v2s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (<2 x s64>), addrspace 1) + ; ; GFX10-LABEL: name: store_global_v2s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -650,33 +755,48 @@ body: | ; GFX6-LABEL: name: store_global_p1 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3 - ; GFX6-NEXT: G_STORE [[COPY1]](p1), [[COPY]](p1) :: (store (p1), addrspace 1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p1), addrspace 1) + ; ; GFX7-LABEL: name: store_global_p1 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1), addrspace 1) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p1), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_p1 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1), addrspace 1) + ; ; GFX8-LABEL: name: store_global_p1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p1), addrspace 1) + ; ; GFX9-LABEL: name: store_global_p1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p1), addrspace 1) + ; ; GFX10-LABEL: name: store_global_p1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -706,30 +826,35 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX6-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1) + ; ; GFX7-LABEL: name: store_global_v2p1 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_v2p1 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1) + ; ; GFX8-LABEL: name: store_global_v2p1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1) + ; ; GFX9-LABEL: name: store_global_v2p1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store (<2 x p1>), addrspace 1) + ; ; GFX10-LABEL: name: store_global_v2p1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -756,33 +881,48 @@ body: | ; GFX6-LABEL: name: store_global_p3 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6-NEXT: G_STORE [[COPY1]](p3), [[COPY]](p1) :: (store (p3), addrspace 1) + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 1) + ; ; GFX7-LABEL: name: store_global_p3 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 1) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (p3), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_p3 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 1) + ; ; GFX8-LABEL: name: store_global_p3 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (p3), addrspace 1) + ; ; GFX9-LABEL: name: store_global_p3 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store (p3), addrspace 1) + ; ; GFX10-LABEL: name: store_global_p3 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -812,30 +952,35 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX6-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) + ; ; GFX7-LABEL: name: store_global_v2p3 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX7-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_v2p3 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) + ; ; GFX8-LABEL: name: store_global_v2p3 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX8-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) + ; ; GFX9-LABEL: name: store_global_v2p3 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3 ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) + ; ; GFX10-LABEL: name: store_global_v2p3 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -869,6 +1014,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1) + ; ; GFX7-LABEL: name: store_atomic_global_s32 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} @@ -880,24 +1026,28 @@ body: | ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_atomic_global_s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32), addrspace 1) + ; ; GFX8-LABEL: name: store_atomic_global_s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32), addrspace 1) + ; ; GFX9-LABEL: name: store_atomic_global_s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1) + ; ; GFX10-LABEL: name: store_atomic_global_s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -932,6 +1082,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1) + ; ; GFX7-LABEL: name: store_atomic_global_s64 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} @@ -943,24 +1094,28 @@ body: | ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_atomic_global_s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: {{ $}} ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64), addrspace 1) + ; ; GFX8-LABEL: name: store_atomic_global_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64), addrspace 1) + ; ; GFX9-LABEL: name: store_atomic_global_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1) + ; ; GFX10-LABEL: name: store_atomic_global_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -995,6 +1150,7 @@ body: | ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; ; GFX7-LABEL: name: store_global_s32_gep_2047 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-NEXT: {{ $}} @@ -1006,6 +1162,7 @@ body: | ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; ; GFX7-FLAT-LABEL: name: store_global_s32_gep_2047 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT-NEXT: {{ $}} @@ -1022,6 +1179,7 @@ body: | ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; ; GFX8-LABEL: name: store_global_s32_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} @@ -1038,12 +1196,14 @@ body: | ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32), addrspace 1) + ; ; GFX9-LABEL: name: store_global_s32_gep_2047 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, implicit $exec :: (store (s32), addrspace 1) + ; ; GFX10-LABEL: name: store_global_s32_gep_2047 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index 7160f03d2c3d6..a5482bd5b79a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -748,11 +748,11 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GFX7-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_nop 3 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_nop 1 ; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index a325288ca1601..13f7885207105 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -174,24 +174,21 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_1: @@ -263,24 +260,21 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) { ; GFX7-LABEL: test_div_scale_f64_2: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[2:3], v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_2: @@ -649,19 +643,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out ; GFX7-LABEL: test_div_scale_f64_scalar_num_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[8:9] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_1: @@ -724,19 +718,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out ; GFX7-LABEL: test_div_scale_f64_scalar_num_2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[8:9], v[0:1], s[8:9] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_scalar_num_2: @@ -799,19 +793,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out ; GFX7-LABEL: test_div_scale_f64_scalar_den_1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[8:9], s[8:9], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_1: @@ -874,19 +868,19 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out ; GFX7-LABEL: test_div_scale_f64_scalar_den_2: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x15 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[8:9], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_scalar_den_2: @@ -1071,9 +1065,9 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_1: @@ -1131,9 +1125,9 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_all_scalar_2: @@ -1644,14 +1638,14 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) % define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %out) #0 { ; GFX7-LABEL: test_div_scale_f64_val_undef_val: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s3, 0x40200000 ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f64_val_undef_val: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 8506efc1d9786..043e69abaeef2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -31,9 +31,9 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0 store i64 %tmp, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index 469e6ddad0a07..c28d204784d4b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -4,8 +4,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s -; FIXME: -; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -112,6 +111,56 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v3 ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v6, v5 ; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_constant_v3i32_align1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1 +; GFX6-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3 +; GFX6-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; GFX6-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:5 +; GFX6-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:7 +; GFX6-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:6 +; GFX6-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:9 +; GFX6-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11 +; GFX6-NEXT: buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX6-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX6-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX6-NEXT: s_waitcnt vmcnt(11) +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX6-NEXT: s_waitcnt vmcnt(10) +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX6-NEXT: s_waitcnt vmcnt(9) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX6-NEXT: s_waitcnt vmcnt(8) +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; GFX6-NEXT: s_waitcnt vmcnt(7) +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX6-NEXT: s_waitcnt vmcnt(6) +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX6-NEXT: s_waitcnt vmcnt(5) +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 8, v8 +; GFX6-NEXT: s_waitcnt vmcnt(4) +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 24, v9 +; GFX6-NEXT: s_waitcnt vmcnt(3) +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; GFX6-NEXT: s_waitcnt vmcnt(2) +; GFX6-NEXT: v_or_b32_e32 v1, v1, v11 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: v_or_b32_e32 v3, v4, v12 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v9 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v6, v5 +; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1 ret <3 x i32> %load } @@ -176,6 +225,32 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) { ; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_constant_v3i32_align2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 +; GFX6-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:6 +; GFX6-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:10 +; GFX6-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX6-NEXT: buffer_load_ushort v7, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX6-NEXT: s_waitcnt vmcnt(5) +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX6-NEXT: s_waitcnt vmcnt(4) +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: s_waitcnt vmcnt(3) +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: s_waitcnt vmcnt(2) +; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2 ret <3 x i32> %load } @@ -197,6 +272,20 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) { ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_constant_v3i32_align4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4 ret <3 x i32> %load } @@ -218,6 +307,20 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) { ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_constant_i96_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i96, ptr addrspace(4) %ptr, align 8 ret i96 %load } @@ -239,6 +342,20 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) { ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_constant_v3i32_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_dwordx2 v[3:4], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8 ret <3 x i32> %load } @@ -266,6 +383,23 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) { ; GFX7-NEXT: v_mov_b32_e32 v2, v7 ; GFX7-NEXT: v_mov_b32_e32 v4, v8 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_constant_v6i16_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v2, v7 +; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8 ret <6 x i16> %load } @@ -313,6 +447,29 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) { ; GFX7-NEXT: v_mov_b32_e32 v1, v13 ; GFX7-NEXT: v_mov_b32_e32 v2, v12 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_constant_v12i8_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_dwordx2 v[12:13], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dword v8, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 8, v12 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v12 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v12 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v13 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v13 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 24, v8 +; GFX6-NEXT: v_mov_b32_e32 v0, v12 +; GFX6-NEXT: v_mov_b32_e32 v4, v13 +; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8 ret <12 x i8> %load } @@ -334,6 +491,16 @@ define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) { ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_constant_v3i32_align16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16 ret <3 x i32> %load } @@ -451,6 +618,57 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 ; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX6-LABEL: s_load_constant_v3i32_align1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1 +; GFX6-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:3 +; GFX6-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:2 +; GFX6-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:5 +; GFX6-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:7 +; GFX6-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:6 +; GFX6-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:9 +; GFX6-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:11 +; GFX6-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:10 +; GFX6-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 +; GFX6-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:8 +; GFX6-NEXT: s_waitcnt vmcnt(11) +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX6-NEXT: s_waitcnt vmcnt(10) +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: s_waitcnt vmcnt(9) +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt vmcnt(8) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX6-NEXT: s_waitcnt vmcnt(7) +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX6-NEXT: s_waitcnt vmcnt(6) +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX6-NEXT: s_waitcnt vmcnt(5) +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX6-NEXT: s_waitcnt vmcnt(4) +; GFX6-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX6-NEXT: s_waitcnt vmcnt(3) +; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX6-NEXT: s_waitcnt vmcnt(2) +; GFX6-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: v_or_b32_e32 v2, v3, v10 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v4, v6, v11 +; GFX6-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v2 +; GFX6-NEXT: ; return to shader part epilog %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1 ret <3 x i32> %load } @@ -523,6 +741,33 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 ; GFX7-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX6-LABEL: s_load_constant_v3i32_align2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 +; GFX6-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:6 +; GFX6-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:10 +; GFX6-NEXT: buffer_load_ushort v3, off, s[0:3], 0 +; GFX6-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4 +; GFX6-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8 +; GFX6-NEXT: s_waitcnt vmcnt(5) +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: s_waitcnt vmcnt(4) +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: s_waitcnt vmcnt(3) +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt vmcnt(2) +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: s_waitcnt vmcnt(1) +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v2 +; GFX6-NEXT: ; return to shader part epilog %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2 ret <3 x i32> %load } @@ -545,6 +790,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg ; GFX7-NEXT: s_mov_b32 s0, s4 ; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX6-LABEL: s_load_constant_v3i32_align4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: ; return to shader part epilog %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4 ret <3 x i32> %load } @@ -567,6 +821,15 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) { ; GFX7-NEXT: s_mov_b32 s0, s4 ; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX6-LABEL: s_load_constant_i96_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: ; return to shader part epilog %load = load i96, ptr addrspace(4) %ptr, align 8 ret i96 %load } @@ -589,6 +852,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg ; GFX7-NEXT: s_mov_b32 s0, s4 ; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX6-LABEL: s_load_constant_v3i32_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: ; return to shader part epilog %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8 ret <3 x i32> %load } @@ -611,6 +883,15 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg ; GFX7-NEXT: s_mov_b32 s0, s4 ; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX6-LABEL: s_load_constant_v6i16_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: ; return to shader part epilog %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8 %cast = bitcast <6 x i16> %load to <3 x i32> ret <3 x i32> %cast @@ -652,6 +933,24 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg ; GFX7-NEXT: s_mov_b32 s0, s12 ; GFX7-NEXT: s_mov_b32 s4, s13 ; GFX7-NEXT: ; return to shader part epilog +; +; GFX6-LABEL: s_load_constant_v12i8_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x0 +; GFX6-NEXT: s_load_dword s8, s[0:1], 0x2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b32 s1, s12, 8 +; GFX6-NEXT: s_lshr_b32 s2, s12, 16 +; GFX6-NEXT: s_lshr_b32 s3, s12, 24 +; GFX6-NEXT: s_lshr_b32 s5, s13, 8 +; GFX6-NEXT: s_lshr_b32 s6, s13, 16 +; GFX6-NEXT: s_lshr_b32 s7, s13, 24 +; GFX6-NEXT: s_lshr_b32 s9, s8, 8 +; GFX6-NEXT: s_lshr_b32 s10, s8, 16 +; GFX6-NEXT: s_lshr_b32 s11, s8, 24 +; GFX6-NEXT: s_mov_b32 s0, s12 +; GFX6-NEXT: s_mov_b32 s4, s13 +; GFX6-NEXT: ; return to shader part epilog %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8 ret <12 x i8> %load } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 50f143dc448c3..7ad19a4797003 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -329,16 +329,13 @@ define amdgpu_kernel void @muli24_shl64(ptr addrspace(1) nocapture %arg, ptr add ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_or_b32_e32 v0, 0xff800000, v1 -; GFX7-NEXT: v_mul_i32_i24_e32 v1, -7, v0 -; GFX7-NEXT: v_lshl_b64 v[0:1], v[1:2], 3 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v5 -; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_or_b32_e32 v1, 0xff800000, v1 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, -7, v1 +; GFX7-NEXT: v_lshl_b64 v[3:4], v[1:2], 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX7-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: muli24_shl64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll index 9bc6189503887..c040c912b9ff7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll @@ -1,5 +1,4 @@ -; FIXME: Need to add support for mubuf stores to enable this on SI. -; XUN: llc < %s -march=amdgcn -mcpu=tahiti -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=SI,GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=SI,GCN %s ; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=CI,GCN,SICIVI %s ; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefixes=VI,GCN,SICIVI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -show-mc-encoding -verify-machineinstrs -global-isel < %s | FileCheck --check-prefixes=GFX9_10,GCN,VIGFX9_10,SIVIGFX9_10 %s diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 40ebb191802a8..70d915df7cb00 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -march=amdgcn -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s -; TODO: LLVM ERROR: cannot select: G_STORE %30:vgpr(s64), %22:vgpr(p1) -; RUN: not --crash llc -march=amdgcn -global-isel=1 -verify-machineinstrs < %s +; RUN: llc -march=amdgcn -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-SDAG %s +; RUN: llc -march=amdgcn -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-GISEL %s ; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s ; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s @@ -10,22 +9,39 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-GISEL %s define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: @@ -104,22 +120,39 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: v_test_fmed3_nnan_r_i_i_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32: ; VI-SDAG: ; %bb.0: @@ -199,22 +232,39 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt } define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: ; VI-SDAG: ; %bb.0: @@ -294,22 +344,39 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) } define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: ; VI-SDAG: ; %bb.0: @@ -389,23 +456,41 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) } define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 -; SI-NEXT: v_min_f32_e32 v2, 2.0, v2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_max_f32_e32 v2, 4.0, v2 +; SI-SDAG-NEXT: v_min_f32_e32 v2, 2.0, v2 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, 4.0, v2 +; SI-GISEL-NEXT: v_min_f32_e32 v2, 2.0, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: ; VI-SDAG: ; %bb.0: @@ -488,26 +573,47 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp } define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, 2.0, v2 -; SI-NEXT: v_min_f32_e32 v3, 4.0, v2 -; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_max_f32_e32 v2, 2.0, v2 +; SI-SDAG-NEXT: v_min_f32_e32 v3, 4.0, v2 +; SI-SDAG-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v3, 2.0, v2 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: ; VI-SDAG: ; %bb.0: @@ -639,23 +745,41 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1 } define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: v_test_fmed3_r_i_i_f64: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 -; SI-NEXT: v_max_f64 v[2:3], v[2:3], 2.0 -; SI-NEXT: v_min_f64 v[2:3], v[2:3], 4.0 -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], 2.0 +; SI-SDAG-NEXT: v_min_f64 v[2:3], v[2:3], 4.0 +; SI-SDAG-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], 2.0 +; SI-GISEL-NEXT: v_min_f64 v[2:3], v[2:3], 4.0 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64: ; VI-SDAG: ; %bb.0: @@ -739,21 +863,37 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add } define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { -; SI-LABEL: v_test_fmed3_r_i_i_no_nans_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32: ; VI-SDAG: ; %bb.0: @@ -827,22 +967,40 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, } define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: v_test_legacy_fmed3_r_i_i_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_max_legacy_f32_e64 v2, v2, 2.0 +; SI-GISEL-NEXT: v_min_legacy_f32_e64 v2, v2, 4.0 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32: ; VI-SDAG: ; %bb.0: @@ -963,29 +1121,52 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, -v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, -v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: @@ -1130,29 +1311,52 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, -v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, -v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v3, -1.0, v3 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: ; VI-SDAG: ; %bb.0: @@ -1297,29 +1501,52 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, -v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, -v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v4 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: ; VI-SDAG: ; %bb.0: @@ -1464,29 +1691,53 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, -v2, |v3|, -|v4| -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, -v2, |v3|, -|v4| +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v4| +; SI-GISEL-NEXT: v_med3_f32 v2, v2, |v3|, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: ; VI-SDAG: ; %bb.0: @@ -1640,29 +1891,54 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, -|v2|, -|v3|, -|v4| -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, -|v2|, -|v3|, -|v4| +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e64 v2, -1.0, |v2| +; SI-GISEL-NEXT: v_mul_f32_e64 v3, -1.0, |v3| +; SI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v4| +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: ; VI-SDAG: ; %bb.0: @@ -1821,32 +2097,57 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs } define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_nnan_inputs_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0: ; VI-SDAG: ; %bb.0: @@ -1971,29 +2272,51 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt } define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_nnan_input_calls_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0: ; VI-SDAG: ; %bb.0: @@ -2101,29 +2424,51 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou } define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_nnan_call_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_nnan_call_med3_f32_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: @@ -2231,29 +2576,51 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr } define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_fast_call_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_fast_call_med3_f32_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_fast_call_med3_f32_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0: ; VI-SDAG: ; %bb.0: @@ -2373,29 +2740,51 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr ; + commute outermost max define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0: ; VI-SDAG: ; %bb.0: @@ -2503,29 +2892,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat1: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1: ; VI-SDAG: ; %bb.0: @@ -2633,29 +3044,52 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, -v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, -v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0: ; VI-SDAG: ; %bb.0: @@ -2800,29 +3234,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat2: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2: ; VI-SDAG: ; %bb.0: @@ -2930,29 +3386,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat3: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3: ; VI-SDAG: ; %bb.0: @@ -3060,29 +3538,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat4: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4: ; VI-SDAG: ; %bb.0: @@ -3190,29 +3690,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat5: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5: ; VI-SDAG: ; %bb.0: @@ -3320,29 +3842,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat6: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6: ; VI-SDAG: ; %bb.0: @@ -3450,29 +3994,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat7: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7: ; VI-SDAG: ; %bb.0: @@ -3580,29 +4146,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat8: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8: ; VI-SDAG: ; %bb.0: @@ -3710,29 +4298,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat9: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9: ; VI-SDAG: ; %bb.0: @@ -3840,29 +4450,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat10: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10: ; VI-SDAG: ; %bb.0: @@ -3970,29 +4602,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) % } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat11: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11: ; VI-SDAG: ; %bb.0: @@ -4100,29 +4754,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) % } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat12: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12: ; VI-SDAG: ; %bb.0: @@ -4230,29 +4906,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) % } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat13: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13: ; VI-SDAG: ; %bb.0: @@ -4360,29 +5058,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) % } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat14: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14: ; VI-SDAG: ; %bb.0: @@ -4490,29 +5210,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) % } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat15: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v3, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15: ; VI-SDAG: ; %bb.0: @@ -4623,29 +5365,51 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) % ; min(max(x, y), max(min(x, y), z)) define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16: ; VI-SDAG: ; %bb.0: @@ -4757,38 +5521,70 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) % ; --------------------------------------------------------------------- define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_min_f32_e32 v5, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_min_f32_e32 v2, v2, v4 -; SI-NEXT: v_max_f32_e32 v2, v5, v2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s10, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: buffer_store_dword v5, off, s[8:11], 0 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, s11 +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: buffer_store_dword v5, off, s[0:3], 0 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0: ; VI-SDAG: ; %bb.0: @@ -4927,38 +5723,70 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) } define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use1: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_min_f32_e32 v5, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_min_f32_e32 v2, v2, v4 -; SI-NEXT: v_max_f32_e32 v2, v5, v2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s10, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, s11 +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1: ; VI-SDAG: ; %bb.0: @@ -5121,38 +5949,70 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) } define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use2: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_min_f32_e32 v5, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: v_min_f32_e32 v2, v2, v4 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_max_f32_e32 v2, v5, v2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s10, -1 +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_mov_b32 s3, s11 +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2: ; VI-SDAG: ; %bb.0: @@ -5291,34 +6151,62 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) } define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_test_safe_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_min_f32_e32 v5, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_min_f32_e32 v2, v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v5, v2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_safe_med3_f32_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-SDAG-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_safe_med3_f32_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0: ; VI-SDAG: ; %bb.0: @@ -5469,32 +6357,57 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr } define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: ; VI-SDAG: ; %bb.0: @@ -5619,32 +6532,57 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) } define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: ; VI-SDAG: ; %bb.0: @@ -5769,32 +6707,57 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) } define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: ; VI-SDAG: ; %bb.0: @@ -5919,29 +6882,52 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) } define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_med3_f32 v2, -v2, v3, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_med3_f32 v2, -v2, v3, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; SI-GISEL-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0: ; VI-SDAG: ; %bb.0: @@ -6086,31 +7072,57 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa } define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] -; SI-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_min_f32_e64 v5, -v2, v3 -; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: v_min_f32_e32 v2, v2, v4 -; SI-NEXT: v_max_f32_e32 v2, v5, v2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_min_f32_e64 v5, -v2, v3 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_mul_f32_e32 v5, -1.0, v2 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v5, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: ; VI-SDAG: ; %bb.0: @@ -6272,30 +7284,53 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt ; A simple min and max is not sufficient define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { -; SI-LABEL: v_test_global_nnans_min_max_f32: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_max_f32_e32 v2, v2, v3 -; SI-NEXT: v_min_f32_e32 v2, v2, v4 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_global_nnans_min_max_f32: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_global_nnans_min_max_f32: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32: ; VI-SDAG: ; %bb.0: @@ -6404,24 +7439,51 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out } define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 1.0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, 2.0 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: ; VI-SDAG: ; %bb.0: @@ -6502,36 +7564,83 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o } define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { -; SI-LABEL: v_nnan_inputs_med3_f16_pat0: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[14:15], s[10:11] -; SI-NEXT: s_mov_b64 s[18:19], s[10:11] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] -; SI-NEXT: s_mov_b64 s[12:13], s[4:5] -; SI-NEXT: s_mov_b64 s[16:17], s[6:7] -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v3, v[0:1], s[12:15], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v4, v[0:1], s[16:19], 0 addr64 glc -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 -; SI-NEXT: v_med3_f32 v2, v2, v3, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s11, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s10, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-SDAG-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-SDAG-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-SDAG-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-SDAG-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_ushort v3, v[0:1], s[12:15], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_load_ushort v4, v[0:1], s[16:19], 0 addr64 glc +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-SDAG-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-SDAG-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-SDAG-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s10, 0 +; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 2.0 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v5, 4.0 +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5] +; SI-GISEL-NEXT: buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-GISEL-NEXT: buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-GISEL-NEXT: v_add_f32_e32 v2, v4, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-GISEL-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v5 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, v3, v2 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0: ; VI-SDAG: ; %bb.0: @@ -6663,23 +7772,41 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt } define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: two_non_inline_constant: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0.5, v2 -; SI-NEXT: v_max_f32_e32 v2, 0x41000000, v2 -; SI-NEXT: v_min_f32_e32 v2, 0x41800000, v2 -; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: two_non_inline_constant: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v2 +; SI-SDAG-NEXT: v_max_f32_e32 v2, 0x41000000, v2 +; SI-SDAG-NEXT: v_min_f32_e32 v2, 0x41800000, v2 +; SI-SDAG-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: two_non_inline_constant: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v2, 0.5, v2 +; SI-GISEL-NEXT: v_max_f32_e32 v2, 0x41000000, v2 +; SI-GISEL-NEXT: v_min_f32_e32 v2, 0x41800000, v2 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: two_non_inline_constant: ; VI-SDAG: ; %bb.0: @@ -6779,27 +7906,49 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants. define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: one_non_inline_constant: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: v_mov_b32_e32 v3, 0x41800000 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v4, 0.5, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x41800000, v2 -; SI-NEXT: v_med3_f32 v3, v4, 1.0, v3 -; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: one_non_inline_constant: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41800000 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v4, 0.5, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v2, 0x41800000, v2 +; SI-SDAG-NEXT: v_med3_f32 v3, v4, 1.0, v3 +; SI-SDAG-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: one_non_inline_constant: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_mov_b32 s4, 0x41800000 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v3, 0.5, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v2, 0x41800000, v2 +; SI-GISEL-NEXT: v_med3_f32 v3, v3, 1.0, s4 +; SI-GISEL-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: one_non_inline_constant: ; VI-SDAG: ; %bb.0: @@ -6912,31 +8061,57 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad } define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { -; SI-LABEL: two_non_inline_constant_multi_use: -; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s4, 0x41000000 -; SI-NEXT: v_mov_b32_e32 v3, 0x41800000 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_f32_e32 v4, 0.5, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x41800000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x41000000, v2 -; SI-NEXT: v_med3_f32 v3, v4, s4, v3 -; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_endpgm +; SI-SDAG-LABEL: two_non_inline_constant_multi_use: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-SDAG-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-SDAG-NEXT: s_mov_b32 s4, 0x41000000 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x41800000 +; SI-SDAG-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_add_f32_e32 v4, 0.5, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v5, 0x41800000, v2 +; SI-SDAG-NEXT: v_add_f32_e32 v2, 0x41000000, v2 +; SI-SDAG-NEXT: v_med3_f32 v3, v4, s4, v3 +; SI-SDAG-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-SDAG-NEXT: buffer_store_dword v5, off, s[4:7], 0 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: two_non_inline_constant_multi_use: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-GISEL-NEXT: s_mov_b32 s4, 0x41000000 +; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000 +; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-GISEL-NEXT: s_mov_b32 s6, -1 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: v_add_f32_e32 v4, 0.5, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v5, 0x41800000, v2 +; SI-GISEL-NEXT: v_add_f32_e32 v2, 0x41000000, v2 +; SI-GISEL-NEXT: v_med3_f32 v3, v4, s4, v3 +; SI-GISEL-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-GISEL-NEXT: buffer_store_dword v5, off, s[4:7], 0 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) +; SI-GISEL-NEXT: s_endpgm ; ; VI-SDAG-LABEL: two_non_inline_constant_multi_use: ; VI-SDAG: ; %bb.0: @@ -7085,3 +8260,5 @@ declare half @llvm.maxnum.f16(half, half) #0 attributes #0 = { nounwind readnone } attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index a8709c8a9a7c0..c5a430823ecf3 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -global-isel=0 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s -; TODO: Crashes on selecting G_STORE. -; RUN: not --crash llc -amdgpu-scalarize-global-loads=false -march=amdgcn -global-isel=1 -verify-machineinstrs -enable-unsafe-fp-math < %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -global-isel=1 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s @@ -28,6 +27,18 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fptrunc_f32_to_f16: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s3 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -150,6 +161,19 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fptrunc_f64_to_f16: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -282,6 +306,21 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fptrunc_v2f32_to_v2f16: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s4 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s5 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -311,10 +350,10 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 ; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-GISEL-NEXT: v_or_b32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm ; ; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16: @@ -345,9 +384,10 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16: @@ -381,10 +421,11 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -421,6 +462,23 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -448,15 +506,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-GISEL-NEXT: v_or_b32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm ; ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16: @@ -486,14 +544,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16: @@ -528,6 +587,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] @@ -536,8 +597,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -569,6 +629,18 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fneg_fptrunc_f32_to_f16: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s3 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -691,6 +763,18 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fabs_fptrunc_f32_to_f16: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s3| +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -813,6 +897,18 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s3| +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -936,6 +1032,18 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s3 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -1063,6 +1171,18 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s3| +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 @@ -1192,6 +1312,19 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; +; SI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s3 +; SI-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; ; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 5ec9284c870c1..a8cbb0000ce72 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -131,9 +131,10 @@ define amdgpu_kernel void @fptrunc( ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GISEL-NEXT: s_mov_b32 s2, -1 +; GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) {