diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2290223a06f8e..89b697b2d5152 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4507,13 +4507,16 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
   };
 
   if (Op.getValueType() == MVT::bf16) {
+    unsigned MaxWidth = IsSigned
+                            ? DAG.ComputeMaxSignificantBits(SrcVal)
+                            : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
     // bf16 conversions are promoted to f32 when converting from i16.
-    if (DAG.ComputeMaxSignificantBits(SrcVal) <= 24) {
+    if (MaxWidth <= 24) {
       return IntToFpViaPromotion(MVT::f32);
     }
 
     // bf16 conversions are promoted to f64 when converting from i32.
-    if (DAG.ComputeMaxSignificantBits(SrcVal) <= 53) {
+    if (MaxWidth <= 53) {
       return IntToFpViaPromotion(MVT::f64);
     }
 
@@ -19376,6 +19379,94 @@ static SDValue performBuildVectorCombine(SDNode *N,
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
+  if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
+    SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
+            Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
+    if (Elt0->getOpcode() == ISD::FP_ROUND &&
+        Elt1->getOpcode() == ISD::FP_ROUND &&
+        isa<ConstantSDNode>(Elt0->getOperand(1)) &&
+        isa<ConstantSDNode>(Elt1->getOperand(1)) &&
+        Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
+        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        // Constant index.
+        isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
+        isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
+        Elt0->getOperand(0)->getOperand(0) ==
+            Elt1->getOperand(0)->getOperand(0) &&
+        Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
+        Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
+      SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
+      if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
+        SDValue HighLanes;
+        if (Elt2->getOpcode() == ISD::UNDEF &&
+            Elt3->getOpcode() == ISD::UNDEF) {
+          HighLanes = DAG.getUNDEF(MVT::v2f32);
+        } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
+                   Elt3->getOpcode() == ISD::FP_ROUND &&
+                   isa<ConstantSDNode>(Elt2->getOperand(1)) &&
+                   isa<ConstantSDNode>(Elt3->getOperand(1)) &&
+                   Elt2->getConstantOperandVal(1) ==
+                       Elt3->getConstantOperandVal(1) &&
+                   Elt2->getOperand(0)->getOpcode() ==
+                       ISD::EXTRACT_VECTOR_ELT &&
+                   Elt3->getOperand(0)->getOpcode() ==
+                       ISD::EXTRACT_VECTOR_ELT &&
+                   // Constant index.
+                   isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
+                   isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
+                   Elt2->getOperand(0)->getOperand(0) ==
+                       Elt3->getOperand(0)->getOperand(0) &&
+                   Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
+                   Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
+          SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
+          HighLanes =
+              DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
+        }
+        if (HighLanes) {
+          SDValue DoubleToSingleSticky =
+              DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
+          SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+                                       DoubleToSingleSticky, HighLanes);
+          return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
+                             Elt0->getOperand(1));
+        }
+      }
+    }
+  }
+
+  if (VT == MVT::v2f64) {
+    SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
+    if (Elt0->getOpcode() == ISD::FP_EXTEND &&
+        Elt1->getOpcode() == ISD::FP_EXTEND &&
+        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+        Elt0->getOperand(0)->getOperand(0) ==
+            Elt1->getOperand(0)->getOperand(0) &&
+        // Constant index.
+        isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
+        isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
+        Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
+            Elt1->getOperand(0)->getConstantOperandVal(1) &&
+        // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
+        // ResultType's known minimum vector length.
+        Elt0->getOperand(0)->getConstantOperandVal(1) %
+                VT.getVectorMinNumElements() ==
+            0) {
+      SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
+      if (SrcVec.getValueType() == MVT::v4f16 ||
+          SrcVec.getValueType() == MVT::v4bf16) {
+        SDValue HalfToSingle =
+            DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
+        SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
+        SDValue Extract = DAG.getNode(
+            ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
+            HalfToSingle, SubvectorIdx);
+        return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
+      }
+    }
+  }
+
   // A build vector of two extracted elements is equivalent to an
   // extract subvector where the inner vector is any-extended to the
   // extract_vector_elt VT.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 091db559a3370..8360bef8e2f82 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6832,7 +6832,7 @@ multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
 }
 
 multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
-                                     Intrinsic OpNode> {
+                                     SDPatternOperator OpNode> {
   def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
                                      asm, ".2s", ".2d",
       [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
@@ -7547,7 +7547,7 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode,
 let mayRaiseFPException = 1, Uses = [FPCR] in
 class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
   : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
-    [(set (f32 FPR32:$Rd), (AArch64fcvtxn (f64 FPR64:$Rn)))]>,
+    [(set (f32 FPR32:$Rd), (AArch64fcvtxnsdr (f64 FPR64:$Rn)))]>,
     Sched<[WriteVd]> {
   bits<5> Rd;
   bits<5> Rn;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f8c6d9019ef6e..3c67f616c1b9c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -757,9 +757,12 @@ def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
 def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;
 
 def AArch64fcvtxn_n: SDNode<"AArch64ISD::FCVTXN", SDTFPRoundOp>;
-def AArch64fcvtxn: PatFrags<(ops node:$Rn),
-                            [(f32 (int_aarch64_sisd_fcvtxn (f64 node:$Rn))),
-                             (f32 (AArch64fcvtxn_n (f64 node:$Rn)))]>;
+def AArch64fcvtxnsdr: PatFrags<(ops node:$Rn),
+                               [(f32 (int_aarch64_sisd_fcvtxn (f64 node:$Rn))),
+                                (f32 (AArch64fcvtxn_n (f64 node:$Rn)))]>;
+def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
+                             [(int_aarch64_neon_fcvtxn node:$Rn),
+                              (AArch64fcvtxn_n node:$Rn)]>;
 
 def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
 def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
@@ -5042,7 +5045,7 @@ def : Pat<(concat_vectors V64:$Rd, (v4f16 (any_fpround (v4f32 V128:$Rn)))),
 defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
 defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
 defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
-                                        int_aarch64_neon_fcvtxn>;
+                                        AArch64fcvtxnv>;
 defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>;
 defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
index aa6b7cb495f18..cafee32ada686 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -199,6 +199,60 @@ define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp {
   ret <2 x float> %vcvt1.i
 }
 
+; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_bf16_f64)
+; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_bf16_f64)
+define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp {
+; GENERIC-LABEL: test_vcvt_bf16_f64:
+; GENERIC:       // %bb.0:
+; GENERIC-NEXT:    fcvtxn v0.2s, v0.2d
+; GENERIC-NEXT:    movi.4s v1, #127, msl #8
+; GENERIC-NEXT:    movi.4s v2, #1
+; GENERIC-NEXT:    ushr.4s v3, v0, #16
+; GENERIC-NEXT:    add.4s v1, v0, v1
+; GENERIC-NEXT:    and.16b v2, v3, v2
+; GENERIC-NEXT:    add.4s v1, v2, v1
+; GENERIC-NEXT:    fcmeq.4s v2, v0, v0
+; GENERIC-NEXT:    orr.4s v0, #64, lsl #16
+; GENERIC-NEXT:    bit.16b v0, v1, v2
+; GENERIC-NEXT:    shrn.4h v0, v0, #16
+; GENERIC-NEXT:    ret
+;
+; FAST-LABEL: test_vcvt_bf16_f64:
+; FAST:       // %bb.0:
+; FAST-NEXT:    fcvtxn v1.2s, v0.2d
+; FAST-NEXT:    // implicit-def: $q0
+; FAST-NEXT:    fmov d0, d1
+; FAST-NEXT:    ushr.4s v1, v0, #16
+; FAST-NEXT:    movi.4s v2, #1
+; FAST-NEXT:    and.16b v1, v1, v2
+; FAST-NEXT:    add.4s v1, v1, v0
+; FAST-NEXT:    movi.4s v2, #127, msl #8
+; FAST-NEXT:    add.4s v1, v1, v2
+; FAST-NEXT:    mov.16b v2, v0
+; FAST-NEXT:    orr.4s v2, #64, lsl #16
+; FAST-NEXT:    fcmeq.4s v0, v0, v0
+; FAST-NEXT:    bsl.16b v0, v1, v2
+; FAST-NEXT:    shrn.4h v0, v0, #16
+; FAST-NEXT:    ret
+;
+; GISEL-LABEL: test_vcvt_bf16_f64:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    fcvtxn v0.2s, v0.2d
+; GISEL-NEXT:    movi.4s v1, #127, msl #8
+; GISEL-NEXT:    movi.4s v2, #1
+; GISEL-NEXT:    ushr.4s v3, v0, #16
+; GISEL-NEXT:    add.4s v1, v0, v1
+; GISEL-NEXT:    and.16b v2, v3, v2
+; GISEL-NEXT:    add.4s v1, v2, v1
+; GISEL-NEXT:    fcmeq.4s v2, v0, v0
+; GISEL-NEXT:    orr.4s v0, #64, lsl #16
+; GISEL-NEXT:    bit.16b v0, v1, v2
+; GISEL-NEXT:    shrn.4h v0, v0, #16
+; GISEL-NEXT:    ret
+  %vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat>
+  ret <2 x bfloat> %vcvt1.i
+}
+
 define half @test_vcvt_f16_f32(<1 x float> %x) {
 ; GENERIC-LABEL: test_vcvt_f16_f32:
 ; GENERIC:       // %bb.0:
@@ -350,3 +404,5 @@ define float @from_half(i16 %in) {
 
 declare float @llvm.convert.from.fp16.f32(i16) #1
 declare i16 @llvm.convert.to.fp16.f32(float) #1
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FALLBACK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
index 7ff61d9bcb0cf..ded343b990ac1 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -312,25 +312,12 @@ define <8 x half> @s_to_h(<8 x float> %a) {
 define <8 x half> @d_to_h(<8 x double> %a) {
 ; CHECK-LABEL: d_to_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d5, v0.d[1]
-; CHECK-NEXT:    fcvt h0, d0
-; CHECK-NEXT:    fcvt h4, d1
-; CHECK-NEXT:    mov d1, v1.d[1]
-; CHECK-NEXT:    fcvt h5, d5
-; CHECK-NEXT:    fcvt h1, d1
-; CHECK-NEXT:    mov v0.h[1], v5.h[0]
-; CHECK-NEXT:    mov v0.h[2], v4.h[0]
-; CHECK-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-NEXT:    fcvt h1, d2
-; CHECK-NEXT:    mov d2, v2.d[1]
-; CHECK-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NEXT:    fcvt h1, d2
-; CHECK-NEXT:    mov d2, v3.d[1]
-; CHECK-NEXT:    mov v0.h[5], v1.h[0]
-; CHECK-NEXT:    fcvt h1, d3
-; CHECK-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-NEXT:    fcvt h1, d2
-; CHECK-NEXT:    mov v0.h[7], v1.h[0]
+; CHECK-NEXT:    fcvtxn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtxn v2.2s, v2.2d
+; CHECK-NEXT:    fcvtxn2 v0.4s, v1.2d
+; CHECK-NEXT:    fcvtxn2 v2.4s, v3.2d
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    fcvtn2 v0.8h, v2.4s
 ; CHECK-NEXT:    ret
   %1 = fptrunc <8 x double> %a to <8 x half>
   ret <8 x half> %1
@@ -349,25 +336,12 @@ define <8 x float> @h_to_s(<8 x half> %a) {
 define <8 x double> @h_to_d(<8 x half> %a) {
 ; CHECK-LABEL: h_to_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    mov h1, v0.h[1]
-; CHECK-NEXT:    mov h3, v0.h[3]
-; CHECK-NEXT:    mov h4, v0.h[2]
-; CHECK-NEXT:    fcvt d0, h0
-; CHECK-NEXT:    mov h5, v2.h[1]
-; CHECK-NEXT:    mov h6, v2.h[3]
-; CHECK-NEXT:    mov h7, v2.h[2]
-; CHECK-NEXT:    fcvt d16, h1
-; CHECK-NEXT:    fcvt d17, h3
-; CHECK-NEXT:    fcvt d1, h4
-; CHECK-NEXT:    fcvt d2, h2
-; CHECK-NEXT:    fcvt d4, h5
-; CHECK-NEXT:    fcvt d5, h6
-; CHECK-NEXT:    fcvt d3, h7
-; CHECK-NEXT:    mov v0.d[1], v16.d[0]
-; CHECK-NEXT:    mov v1.d[1], v17.d[0]
-; CHECK-NEXT:    mov v2.d[1], v4.d[0]
-; CHECK-NEXT:    mov v3.d[1], v5.d[0]
+; CHECK-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-NEXT:    fcvtl v0.2d, v1.2s
+; CHECK-NEXT:    fcvtl2 v3.2d, v2.4s
+; CHECK-NEXT:    fcvtl2 v1.2d, v1.4s
+; CHECK-NEXT:    fcvtl v2.2d, v2.2s
 ; CHECK-NEXT:    ret
   %1 = fpext <8 x half> %a to <8 x double>
   ret <8 x double> %1
diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index eca3389bcd88b..86f7322f7c4ee 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -85,29 +85,46 @@ entry:
 }
 
 define <2 x double> @fpext_v2f16_v2f64(<2 x half> %a) {
-; CHECK-LABEL: fpext_v2f16_v2f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h1, v0.h[1]
-; CHECK-NEXT:    fcvt d0, h0
-; CHECK-NEXT:    fcvt d1, h1
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fpext_v2f16_v2f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT:    fcvtl v0.2d, v0.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fpext_v2f16_v2f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    fcvt d0, h0
+; CHECK-GI-NEXT:    fcvt d1, h1
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fpext <2 x half> %a to <2 x double>
   ret <2 x double> %c
 }
 
 define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) {
-; CHECK-LABEL: fpext_v3f16_v3f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h1, v0.h[1]
-; CHECK-NEXT:    mov h2, v0.h[2]
-; CHECK-NEXT:    fcvt d0, h0
-; CHECK-NEXT:    fcvt d1, h1
-; CHECK-NEXT:    fcvt d2, h2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: fpext_v3f16_v3f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-SD-NEXT:    fcvtl v0.2d, v1.2s
+; CHECK-SD-NEXT:    fcvtl2 v2.2d, v1.4s
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: fpext_v3f16_v3f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-NEXT:    fcvt d0, h0
+; CHECK-GI-NEXT:    fcvt d1, h1
+; CHECK-GI-NEXT:    fcvt d2, h2
+; CHECK-GI-NEXT:    ret
 entry:
   %c = fpext <3 x half> %a to <3 x double>
   ret <3 x double> %c
@@ -116,16 +133,9 @@ entry:
 define <4 x double> @fpext_v4f16_v4f64(<4 x half> %a) {
 ; CHECK-SD-LABEL: fpext_v4f16_v4f64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NEXT:    mov h2, v0.h[3]
-; CHECK-SD-NEXT:    mov h3, v0.h[2]
-; CHECK-SD-NEXT:    fcvt d0, h0
-; CHECK-SD-NEXT:    fcvt d4, h1
-; CHECK-SD-NEXT:    fcvt d2, h2
-; CHECK-SD-NEXT:    fcvt d1, h3
-; CHECK-SD-NEXT:    mov v0.d[1], v4.d[0]
-; CHECK-SD-NEXT:    mov v1.d[1], v2.d[0]
+; CHECK-SD-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-SD-NEXT:    fcvtl2 v1.2d, v0.4s
+; CHECK-SD-NEXT:    fcvtl v0.2d, v0.2s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fpext_v4f16_v4f64:
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 9425988af8349..3efc98ab5fd53 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -84,11 +84,8 @@ entry:
 define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) {
 ; CHECK-SD-LABEL: fptrunc_v2f64_v2f16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov d1, v0.d[1]
-; CHECK-SD-NEXT:    fcvt h0, d0
-; CHECK-SD-NEXT:    fcvt h1, d1
-; CHECK-SD-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    fcvtxn v0.2s, v0.2d
+; CHECK-SD-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptrunc_v2f64_v2f16:
@@ -135,16 +132,9 @@ entry:
 define <4 x half> @fptrunc_v4f64_v4f16(<4 x double> %a) {
 ; CHECK-SD-LABEL: fptrunc_v4f64_v4f16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov d2, v0.d[1]
-; CHECK-SD-NEXT:    fcvt h0, d0
-; CHECK-SD-NEXT:    fcvt h2, d2
-; CHECK-SD-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-SD-NEXT:    fcvt h2, d1
-; CHECK-SD-NEXT:    mov d1, v1.d[1]
-; CHECK-SD-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-SD-NEXT:    fcvt h1, d1
-; CHECK-SD-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    fcvtxn v0.2s, v0.2d
+; CHECK-SD-NEXT:    fcvtxn2 v0.4s, v1.2d
+; CHECK-SD-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptrunc_v4f64_v4f16:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index da9b79a56a951..2ace0bca274af 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -427,49 +427,35 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v4f16_v4f64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    sub sp, sp, #16
-; SVE-NEXT:    .cfi_def_cfa_offset 16
-; SVE-NEXT:    ldp q1, q0, [x1]
-; SVE-NEXT:    ldr d4, [x0]
-; SVE-NEXT:    and z4.h, z4.h, #0x7fff
-; SVE-NEXT:    mov z2.d, z0.d[1]
-; SVE-NEXT:    mov z3.d, z1.d[1]
-; SVE-NEXT:    fcvt h0, d0
-; SVE-NEXT:    fcvt h1, d1
-; SVE-NEXT:    fcvt h2, d2
-; SVE-NEXT:    fcvt h3, d3
-; SVE-NEXT:    str h0, [sp, #12]
-; SVE-NEXT:    str h1, [sp, #8]
-; SVE-NEXT:    str h2, [sp, #14]
-; SVE-NEXT:    str h3, [sp, #10]
-; SVE-NEXT:    ldr d0, [sp, #8]
+; SVE-NEXT:    ldp q0, q1, [x1]
+; SVE-NEXT:    ptrue p0.s, vl2
+; SVE-NEXT:    ptrue p1.s
+; SVE-NEXT:    fcvtxn v1.2s, v1.2d
+; SVE-NEXT:    fcvtxn v0.2s, v0.2d
+; SVE-NEXT:    splice z0.s, p0, z0.s, z1.s
+; SVE-NEXT:    ldr d1, [x0]
+; SVE-NEXT:    and z1.h, z1.h, #0x7fff
+; SVE-NEXT:    fcvt z0.h, p1/m, z0.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; SVE-NEXT:    and z0.h, z0.h, #0x8000
-; SVE-NEXT:    orr z0.d, z4.d, z0.d
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str d0, [x0]
-; SVE-NEXT:    add sp, sp, #16
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v4f16_v4f64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    sub sp, sp, #16
-; SVE2-NEXT:    .cfi_def_cfa_offset 16
-; SVE2-NEXT:    ldp q2, q1, [x1]
-; SVE2-NEXT:    mov z0.h, #32767 // =0x7fff
-; SVE2-NEXT:    ldr d5, [x0]
-; SVE2-NEXT:    mov z3.d, z1.d[1]
-; SVE2-NEXT:    mov z4.d, z2.d[1]
-; SVE2-NEXT:    fcvt h1, d1
-; SVE2-NEXT:    fcvt h2, d2
-; SVE2-NEXT:    fcvt h3, d3
-; SVE2-NEXT:    fcvt h4, d4
-; SVE2-NEXT:    str h1, [sp, #12]
-; SVE2-NEXT:    str h2, [sp, #8]
-; SVE2-NEXT:    str h3, [sp, #14]
-; SVE2-NEXT:    str h4, [sp, #10]
-; SVE2-NEXT:    ldr d1, [sp, #8]
-; SVE2-NEXT:    bsl z5.d, z5.d, z1.d, z0.d
-; SVE2-NEXT:    str d5, [x0]
-; SVE2-NEXT:    add sp, sp, #16
+; SVE2-NEXT:    ldp q0, q1, [x1]
+; SVE2-NEXT:    ptrue p0.s, vl2
+; SVE2-NEXT:    ptrue p1.s
+; SVE2-NEXT:    ldr d2, [x0]
+; SVE2-NEXT:    fcvtxn v1.2s, v1.2d
+; SVE2-NEXT:    fcvtxn v0.2s, v0.2d
+; SVE2-NEXT:    splice z0.s, p0, z0.s, z1.s
+; SVE2-NEXT:    mov z1.h, #32767 // =0x7fff
+; SVE2-NEXT:    fcvt z0.h, p1/m, z0.s
+; SVE2-NEXT:    uzp1 z0.h, z0.h, z0.h
+; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x double>, ptr %bp
diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
index c33759331bbc8..de26676b5c73e 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -209,16 +209,10 @@ define <4 x half> @test_copysign_v4f16_v4f32(<4 x half> %a, <4 x float> %b) #0 {
 define <4 x half> @test_copysign_v4f16_v4f64(<4 x half> %a, <4 x double> %b) #0 {
 ; CHECK-LABEL: test_copysign_v4f16_v4f64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov d3, v1[1]
-; CHECK-NEXT:    fcvt h1, d1
-; CHECK-NEXT:    fcvt h3, d3
-; CHECK-NEXT:    mov.h v1[1], v3[0]
-; CHECK-NEXT:    fcvt h3, d2
-; CHECK-NEXT:    mov d2, v2[1]
-; CHECK-NEXT:    mov.h v1[2], v3[0]
-; CHECK-NEXT:    fcvt h2, d2
-; CHECK-NEXT:    mov.h v1[3], v2[0]
+; CHECK-NEXT:    fcvtxn v1.2s, v1.2d
+; CHECK-NEXT:    fcvtxn2 v1.4s, v2.2d
 ; CHECK-NEXT:    mvni.4h v2, #128, lsl #8
+; CHECK-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-NEXT:    bif.8b v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp0 = fptrunc <4 x double> %b to <4 x half>
@@ -291,42 +285,20 @@ define <4 x bfloat> @test_copysign_v4bf16_v4f32(<4 x bfloat> %a, <4 x float> %b)
 define <4 x bfloat> @test_copysign_v4bf16_v4f64(<4 x bfloat> %a, <4 x double> %b) #0 {
 ; CHECK-LABEL: test_copysign_v4bf16_v4f64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov d3, v1[1]
-; CHECK-NEXT:    fcvtxn s1, d1
-; CHECK-NEXT:    mov w8, #32767 ; =0x7fff
-; CHECK-NEXT:    fcvtxn s3, d3
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    ubfx w12, w10, #16, #1
-; CHECK-NEXT:    add w10, w10, w8
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fcvtxn s3, d2
-; CHECK-NEXT:    mov d2, v2[1]
-; CHECK-NEXT:    add w10, w12, w10
-; CHECK-NEXT:    lsr w10, w10, #16
-; CHECK-NEXT:    ubfx w11, w9, #16, #1
-; CHECK-NEXT:    add w9, w9, w8
-; CHECK-NEXT:    fcvtxn s1, d2
-; CHECK-NEXT:    add w9, w11, w9
-; CHECK-NEXT:    fmov w11, s3
-; CHECK-NEXT:    fmov s3, w10
-; CHECK-NEXT:    lsr w9, w9, #16
-; CHECK-NEXT:    ubfx w12, w11, #16, #1
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    add w9, w11, w8
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    add w9, w12, w9
-; CHECK-NEXT:    lsr w9, w9, #16
-; CHECK-NEXT:    mov.h v3[1], v2[0]
-; CHECK-NEXT:    ubfx w11, w10, #16, #1
-; CHECK-NEXT:    add w8, w10, w8
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    add w8, w11, w8
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    mov.h v3[2], v1[0]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    mov.h v3[3], v1[0]
-; CHECK-NEXT:    mvni.4h v1, #128, lsl #8
-; CHECK-NEXT:    bif.8b v0, v3, v1
+; CHECK-NEXT:    fcvtxn v1.2s, v1.2d
+; CHECK-NEXT:    movi.4s v3, #1
+; CHECK-NEXT:    fcvtxn2 v1.4s, v2.2d
+; CHECK-NEXT:    movi.4s v2, #127, msl #8
+; CHECK-NEXT:    ushr.4s v4, v1, #16
+; CHECK-NEXT:    add.4s v2, v1, v2
+; CHECK-NEXT:    and.16b v3, v4, v3
+; CHECK-NEXT:    add.4s v2, v3, v2
+; CHECK-NEXT:    fcmeq.4s v3, v1, v1
+; CHECK-NEXT:    orr.4s v1, #64, lsl #16
+; CHECK-NEXT:    bit.16b v1, v2, v3
+; CHECK-NEXT:    mvni.4h v2, #128, lsl #8
+; CHECK-NEXT:    shrn.4h v1, v1, #16
+; CHECK-NEXT:    bif.8b v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp0 = fptrunc <4 x double> %b to <4 x bfloat>
   %r = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %a, <4 x bfloat> %tmp0)