-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[SelectionDAG] Improve v2f16 maximumnum expansion #160723
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SelectionDAG] Improve v2f16 maximumnum expansion #160723
Conversation
On targets where f32 maximumnum is legal, but maximumnum on vectors of smaller types is not legal (e.g. v2f16), try unrolling the vector first as part of the expansion. Only fall back to expanding the full maximumnum computation into compares + selects if maximumnum on the scalar element type cannot be supported.
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-backend-nvptx Author: Lewis Crawford (LewisCrawford). Changes: On targets where f32 maximumnum is legal, but maximumnum on vectors of smaller types is not legal (e.g. v2f16), try unrolling the vector first as part of the expansion. Only fall back to expanding the full maximumnum computation into compares + selects if maximumnum on the scalar element type cannot be supported. Patch is 32.70 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/160723.diff. 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4145c8a54a6fe..dba5a8c0a7315 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8839,7 +8839,9 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
return DAG.getNode(IEEE2008Op, DL, VT, LHS, RHS, Flags);
}
- if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+ if (VT.isVector() &&
+ (isOperationLegalOrCustomOrPromote(Opc, VT.getVectorElementType()) ||
+ !isOperationLegalOrCustom(ISD::VSELECT, VT)))
return DAG.UnrollVectorOp(Node);
// If only one operand is NaN, override it with another operand.
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index 5a55fa97033b7..625c93c3f0a53 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -1586,54 +1586,25 @@ define double @minimumnum_double(double %a, double %b) {
ret double %x
}
-; TODO Improve the "Expand" path for minimumnum vectors on targets where
-; f16 is not supported. Ideally it should use two f32 minimumnums first instead of
-; fully expanding the minimumnum instruction into compare/select instructions.
define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) {
; CHECK-NOF16-LABEL: minimumnum_v2half(
; CHECK-NOF16: {
-; CHECK-NOF16-NEXT: .reg .pred %p<13>;
-; CHECK-NOF16-NEXT: .reg .b16 %rs<17>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0];
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1];
-; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
-; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
-; CHECK-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4;
-; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768;
-; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768;
-; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
-; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
-; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
-; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
-; CHECK-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9;
-; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768;
-; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768;
-; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
-; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
-; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
+; CHECK-NOF16-NEXT: min.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT: min.f32 %r6, %r5, %r4;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6;
+; CHECK-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: minimumnum_v2half(
@@ -1649,48 +1620,22 @@ define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) {
;
; CHECK-SM80-NOF16-LABEL: minimumnum_v2half(
; CHECK-SM80-NOF16: {
-; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>;
-; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>;
-; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-SM80-NOF16-EMPTY:
; CHECK-SM80-NOF16-NEXT: // %bb.0:
; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0];
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1];
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
-; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
-; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
-; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
-; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
-; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
+; CHECK-SM80-NOF16-NEXT: min.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1;
+; CHECK-SM80-NOF16-NEXT: min.f32 %r6, %r5, %r4;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6;
+; CHECK-SM80-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-SM80-NOF16-NEXT: ret;
%x = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %x
@@ -1788,54 +1733,25 @@ define double @maximumnum_double(double %a, double %b) {
ret double %x
}
-; TODO Improve the "Expand" path for maximumnum vectors on targets where
-; f16 is not supported. Ideally it should use two f32 maximumnums first instead of
-; fully expanding the maximumnum instruction into compare/select instructions.
define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) {
; CHECK-NOF16-LABEL: maximumnum_v2half(
; CHECK-NOF16: {
-; CHECK-NOF16-NEXT: .reg .pred %p<13>;
-; CHECK-NOF16-NEXT: .reg .b16 %rs<17>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0];
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1];
-; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
-; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
-; CHECK-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4;
-; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
-; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
-; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
-; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
-; CHECK-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9;
-; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
-; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
-; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
+; CHECK-NOF16-NEXT: max.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT: max.f32 %r6, %r5, %r4;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6;
+; CHECK-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: maximumnum_v2half(
@@ -1851,48 +1767,22 @@ define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) {
;
; CHECK-SM80-NOF16-LABEL: maximumnum_v2half(
; CHECK-SM80-NOF16: {
-; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>;
-; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>;
-; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-SM80-NOF16-EMPTY:
; CHECK-SM80-NOF16-NEXT: // %bb.0:
; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0];
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1];
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
-; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
-; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
-; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
-; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
-; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
+; CHECK-SM80-NOF16-NEXT: max.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1;
+; CHECK-SM80-NOF16-NEXT: max.f32 %r6, %r5, %r4;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6;
+; CHECK-SM80-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-SM80-NOF16-NEXT: ret;
%x = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %x
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 54d82b0c1c929..c66473e9edd19 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1756,263 +1756,131 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
;
; AVX512-LABEL: test_fmaximumnum_v4f16:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $56, %rsp
-; AVX512-NEXT: vmovdqa %xmm1, %xmm5
-; AVX512-NEXT: vmovdqa %xmm0, %xmm6
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm1
-; AVX512-NEXT: seta %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm1
-; AVX512-NEXT: seta %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm2
-; AVX512-NEXT: seta %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm3
-; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm5[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm6[1,0]
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm13
-; AVX512-NEXT: vcvtph2ps %xmm13, %xmm2
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm12
-; AVX512-NEXT: vcvtph2ps %xmm12, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm2
-; AVX512-NEXT: seta %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpsrlq $48, %xmm5, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrlq $48, %xmm6, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
-; AVX512-NEXT: vcvtps2p...
[truncated]
|
@llvm/pr-subscribers-llvm-selectiondag Author: Lewis Crawford (LewisCrawford). Changes: On targets where f32 maximumnum is legal, but maximumnum on vectors of smaller types is not legal (e.g. v2f16), try unrolling the vector first as part of the expansion. Only fall back to expanding the full maximumnum computation into compares + selects if maximumnum on the scalar element type cannot be supported. Patch is 32.70 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/160723.diff. 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4145c8a54a6fe..dba5a8c0a7315 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8839,7 +8839,9 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
return DAG.getNode(IEEE2008Op, DL, VT, LHS, RHS, Flags);
}
- if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+ if (VT.isVector() &&
+ (isOperationLegalOrCustomOrPromote(Opc, VT.getVectorElementType()) ||
+ !isOperationLegalOrCustom(ISD::VSELECT, VT)))
return DAG.UnrollVectorOp(Node);
// If only one operand is NaN, override it with another operand.
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index 5a55fa97033b7..625c93c3f0a53 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -1586,54 +1586,25 @@ define double @minimumnum_double(double %a, double %b) {
ret double %x
}
-; TODO Improve the "Expand" path for minimumnum vectors on targets where
-; f16 is not supported. Ideally it should use two f32 minimumnums first instead of
-; fully expanding the minimumnum instruction into compare/select instructions.
define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) {
; CHECK-NOF16-LABEL: minimumnum_v2half(
; CHECK-NOF16: {
-; CHECK-NOF16-NEXT: .reg .pred %p<13>;
-; CHECK-NOF16-NEXT: .reg .b16 %rs<17>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0];
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1];
-; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
-; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
-; CHECK-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4;
-; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768;
-; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768;
-; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
-; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
-; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
-; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
-; CHECK-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9;
-; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768;
-; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768;
-; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
-; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
-; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
+; CHECK-NOF16-NEXT: min.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT: min.f32 %r6, %r5, %r4;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6;
+; CHECK-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: minimumnum_v2half(
@@ -1649,48 +1620,22 @@ define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) {
;
; CHECK-SM80-NOF16-LABEL: minimumnum_v2half(
; CHECK-SM80-NOF16: {
-; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>;
-; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>;
-; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-SM80-NOF16-EMPTY:
; CHECK-SM80-NOF16-NEXT: // %bb.0:
; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0];
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1];
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
-; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
-; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
-; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
-; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
-; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
+; CHECK-SM80-NOF16-NEXT: min.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1;
+; CHECK-SM80-NOF16-NEXT: min.f32 %r6, %r5, %r4;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6;
+; CHECK-SM80-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-SM80-NOF16-NEXT: ret;
%x = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %x
@@ -1788,54 +1733,25 @@ define double @maximumnum_double(double %a, double %b) {
ret double %x
}
-; TODO Improve the "Expand" path for maximumnum vectors on targets where
-; f16 is not supported. Ideally it should use two f32 maximumnums first instead of
-; fully expanding the maximumnum instruction into compare/select instructions.
define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) {
; CHECK-NOF16-LABEL: maximumnum_v2half(
; CHECK-NOF16: {
-; CHECK-NOF16-NEXT: .reg .pred %p<13>;
-; CHECK-NOF16-NEXT: .reg .b16 %rs<17>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0];
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1];
-; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
-; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
-; CHECK-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4;
-; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
-; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
-; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
-; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
-; CHECK-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9;
-; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
-; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0;
-; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
-; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
-; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
+; CHECK-NOF16-NEXT: max.f32 %r3, %r2, %r1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT: max.f32 %r6, %r5, %r4;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6;
+; CHECK-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: maximumnum_v2half(
@@ -1851,48 +1767,22 @@ define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) {
;
; CHECK-SM80-NOF16-LABEL: maximumnum_v2half(
; CHECK-SM80-NOF16: {
-; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>;
-; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>;
-; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>;
+; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>;
+; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-SM80-NOF16-EMPTY:
; CHECK-SM80-NOF16-NEXT: // %bb.0:
; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0];
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1];
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6;
-; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7;
-; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3;
-; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12;
-; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10;
-; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11;
-; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13;
-; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000;
-; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12;
-; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10};
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2;
+; CHECK-SM80-NOF16-NEXT: max.f32 %r3, %r2, %r1;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3;
+; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1;
+; CHECK-SM80-NOF16-NEXT: max.f32 %r6, %r5, %r4;
+; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6;
+; CHECK-SM80-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5};
+; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-SM80-NOF16-NEXT: ret;
%x = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b)
ret <2 x half> %x
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 54d82b0c1c929..c66473e9edd19 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1756,263 +1756,131 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
;
; AVX512-LABEL: test_fmaximumnum_v4f16:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $56, %rsp
-; AVX512-NEXT: vmovdqa %xmm1, %xmm5
-; AVX512-NEXT: vmovdqa %xmm0, %xmm6
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm1
-; AVX512-NEXT: seta %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm1
-; AVX512-NEXT: seta %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm2
-; AVX512-NEXT: seta %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm3
-; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm5[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm6[1,0]
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm13
-; AVX512-NEXT: vcvtph2ps %xmm13, %xmm2
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm12
-; AVX512-NEXT: vcvtph2ps %xmm12, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm2
-; AVX512-NEXT: seta %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpsrlq $48, %xmm5, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrlq $48, %xmm6, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vucomiss %xmm1, %xmm1
-; AVX512-NEXT: setp %al
-; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
-; AVX512-NEXT: vcvtps2p...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM for the effect on NVPTX.
The AVX changes also look like an improvement, but I'm not familiar enough with the details to judge them. We still need someone from the X86 backend to take a look.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
On targets where f32 maximumnum is legal, but maximumnum on vectors of smaller types is not legal (e.g. v2f16), try unrolling the vector first as part of the expansion. Only fall back to expanding the full maximumnum computation into compares + selects if maximumnum on the scalar element type cannot be supported.
On targets where f32 maximumnum is legal, but maximumnum on vectors of smaller types is not legal (e.g. v2f16), try unrolling the vector first as part of the expansion.
Only fall back to expanding the full maximumnum computation into compares + selects if maximumnum on the scalar element type cannot be supported.