diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4145c8a54a6fe..dba5a8c0a7315 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8839,7 +8839,9 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, return DAG.getNode(IEEE2008Op, DL, VT, LHS, RHS, Flags); } - if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) + if (VT.isVector() && + (isOperationLegalOrCustomOrPromote(Opc, VT.getVectorElementType()) || + !isOperationLegalOrCustom(ISD::VSELECT, VT))) return DAG.UnrollVectorOp(Node); // If only one operand is NaN, override it with another operand. diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll index 5a55fa97033b7..625c93c3f0a53 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -1586,54 +1586,25 @@ define double @minimumnum_double(double %a, double %b) { ret double %x } -; TODO Improve the "Expand" path for minimumnum vectors on targets where -; f16 is not supported. Ideally it should use two f32 minimumnums first instead of -; fully expanding the minimumnum instruction into compare/select instructions. define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-LABEL: minimumnum_v2half( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<13>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; -; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; -; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; -; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; -; CHECK-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; -; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; -; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; -; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; -; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; -; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; -; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; -; CHECK-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; -; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; -; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; -; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; -; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; -; CHECK-NOF16-NEXT: 
st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; +; CHECK-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: min.f32 %r6, %r5, %r4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6; +; CHECK-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimumnum_v2half( @@ -1649,48 +1620,22 @@ define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) { ; ; CHECK-SM80-NOF16-LABEL: minimumnum_v2half( ; CHECK-SM80-NOF16: { -; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; -; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; -; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; -; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; -; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; +; CHECK-SM80-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-SM80-NOF16-NEXT: min.f32 %r6, %r5, %r4; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6; +; 
CHECK-SM80-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5}; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-SM80-NOF16-NEXT: ret; %x = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %x @@ -1788,54 +1733,25 @@ define double @maximumnum_double(double %a, double %b) { ret double %x } -; TODO Improve the "Expand" path for maximumnum vectors on targets where -; f16 is not supported. Ideally it should use two f32 maximumnums first instead of -; fully expanding the maximumnum instruction into compare/select instructions. define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-LABEL: maximumnum_v2half( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<13>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; -; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; -; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; -; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; -; CHECK-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4; -; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; -; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; -; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; -; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; -; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; -; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; -; CHECK-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; -; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; -; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; -; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; -; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; -; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; +; CHECK-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: max.f32 %r6, %r5, %r4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6; +; CHECK-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximumnum_v2half( @@ -1851,48 +1767,22 @@ define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) { ; ; CHECK-SM80-NOF16-LABEL: maximumnum_v2half( ; CHECK-SM80-NOF16: { -; 
CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; -; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; -; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; -; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; -; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; +; CHECK-SM80-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-SM80-NOF16-NEXT: max.f32 %r6, %r5, %r4; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6; +; CHECK-SM80-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5}; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-SM80-NOF16-NEXT: ret; %x = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %x diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll index 54d82b0c1c929..c66473e9edd19 100644 --- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll +++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll @@ -1756,263 +1756,131 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; ; AVX512-LABEL: test_fmaximumnum_v4f16: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $56, %rsp -; AVX512-NEXT: vmovdqa %xmm1, %xmm5 -; AVX512-NEXT: vmovdqa %xmm0, 
%xmm6 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2 -; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm2 -; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm3 -; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm5[1,0] -; AVX512-NEXT: vcvtph2ps 
%xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm6[1,0] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm13 -; AVX512-NEXT: vcvtph2ps %xmm13, %xmm2 -; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm12 -; AVX512-NEXT: vcvtph2ps %xmm12, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm2 -; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpsrlq $48, %xmm5, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrlq $48, %xmm6, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm11 -; AVX512-NEXT: vcvtph2ps %xmm11, %xmm1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm15 -; AVX512-NEXT: vcvtph2ps %xmm15, %xmm7 -; AVX512-NEXT: vucomiss %xmm7, %xmm1 -; AVX512-NEXT: seta %al +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm7, %xmm7 {%k1} -; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm5[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al +; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vmovdqa %xmm2, %xmm4 +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512-NEXT: vmaxss %xmm4, %xmm3, %xmm2 +; AVX512-NEXT: vcmpordss %xmm3, %xmm3, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm6[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm9 -; AVX512-NEXT: vcvtph2ps %xmm9, %xmm4 -; AVX512-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm10 -; AVX512-NEXT: vcvtph2ps %xmm10, %xmm3 -; AVX512-NEXT: vucomiss %xmm3, %xmm4 -; AVX512-NEXT: seta %al +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa %xmm3, %xmm5 +; AVX512-NEXT: 
vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 +; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa %xmm3, %xmm5 +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 +; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 ; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} -; AVX512-NEXT: vcvtph2ps %xmm5, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vcvtph2ps %xmm6, %xmm4 -; AVX512-NEXT: vucomiss %xmm4, %xmm4 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm4, %xmm4 {%k2} +; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 +; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX512-NEXT: vcvtph2ps %xmm4, %xmm1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm8 -; AVX512-NEXT: vcvtph2ps %xmm8, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm1 -; AVX512-NEXT: seta %al +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} -; AVX512-NEXT: vpsrld $16, %xmm5, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al +; AVX512-NEXT: vpsrlq $48, %xmm1, %xmm4 +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa %xmm3, %xmm5 +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 +; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrld $16, %xmm6, %xmm5 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] ; AVX512-NEXT: vcvtph2ps 
%xmm5, %xmm5 -; AVX512-NEXT: vucomiss %xmm5, %xmm5 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm1, %xmm5, %xmm5 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm6 -; AVX512-NEXT: vcvtph2ps %xmm6, %xmm5 -; AVX512-NEXT: vmovss %xmm5, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm5 -; AVX512-NEXT: seta %al +; AVX512-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 +; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm4 +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm7 -; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm5 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX512-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX512-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; AVX512-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512-NEXT: vpcmpeqw %xmm6, %xmm4, %xmm9 -; AVX512-NEXT: vpblendvb %xmm9, %xmm4, %xmm0, %xmm4 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm11 # 16-byte Folded Reload -; AVX512-NEXT: # xmm11 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm9[0] -; AVX512-NEXT: 
vpcmpeqw %xmm6, %xmm1, %xmm6 -; AVX512-NEXT: vpblendvb %xmm6, %xmm1, %xmm4, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $65535, %ecx # imm = 0xFFFF -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovel %ecx, %edx -; AVX512-NEXT: vcvtph2ps %xmm5, %xmm2 -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovel %ecx, %esi -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm2 -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %edi -; AVX512-NEXT: cmovel %ecx, %edi -; AVX512-NEXT: vcvtph2ps %xmm7, %xmm2 -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %r8d -; AVX512-NEXT: cmovel %ecx, %r8d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %r9d -; AVX512-NEXT: cmovel %ecx, %r9d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %r10d -; AVX512-NEXT: cmovel %ecx, %r10d -; AVX512-NEXT: vcvtph2ps (%rsp), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %r11d -; AVX512-NEXT: cmovel %ecx, %r11d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: vmovd %esi, %xmm2 -; AVX512-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $2, %edi, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $3, %r8d, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $4, %r9d, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $5, %r10d, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $6, %r11d, %xmm2, %xmm2 -; AVX512-NEXT: cmovel %ecx, %eax -; AVX512-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm5 +; AVX512-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 +; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm0, %xmm5 +; AVX512-NEXT: vmovss %xmm1, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmaxss %xmm5, %xmm1, %xmm0 +; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512-NEXT: retq ; ; AVX10_2-LABEL: test_fmaximumnum_v4f16:
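The unrolled-and-promoted path above is only sound because llvm.minimumnum / llvm.maximumnum follow IEEE-754 minimumNumber / maximumNumber semantics: if exactly one operand is NaN the other operand is returned, both-NaN yields a quiet NaN, and -0.0 orders below +0.0. A minimal C++ sketch of the scalar rule (the helper name minimum_num is illustrative only, not an LLVM API):

    #include <cmath>
    #include <limits>

    // Scalar model of llvm.minimumnum (IEEE-754 minimumNumber): a lone NaN
    // is treated as missing data, both-NaN yields a quiet NaN, and -0.0 is
    // ordered below +0.0.
    float minimum_num(float a, float b) {
      if (std::isnan(a))
        return std::isnan(b) ? std::numeric_limits<float>::quiet_NaN() : b;
      if (std::isnan(b))
        return a;
      if (a == b)                        // resolves -0.0 vs +0.0
        return std::signbit(a) ? a : b;
      return a < b ? a : b;
    }

Since every f16 value converts exactly to f32 and minimumNumber always returns one of its operands, the widen/narrow round trip in the new checks (cvt.f32.f16, min.f32/max.f32, cvt.rn.f16.f32) introduces no double rounding, assuming the target's f32 min/max provides the same NaN handling. That is what lets the unrolled code replace the long compare/select chains of the old expansion.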