diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 23328ed57fb36..b7011e0ea1669 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22189,6 +22189,17 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
                      Zero);
 }
 
+static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode,
+                                           SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::i16)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
+  SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
+}
+
 // If a merged operation has no inactive lanes we can relax it to a predicated
 // or unpredicated operation, which potentially allows better isel (perhaps
 // using immediate forms) or relaxing register reuse requirements.
@@ -22442,6 +22453,26 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_neon_uabd:
     return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_fcvtzs:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtzu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtas:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtau:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtms:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtmu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtns:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtnu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtps:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtpu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
   case Intrinsic::aarch64_crc32b:
   case Intrinsic::aarch64_crc32cb:
     return tryCombineCRC32(0xff, N, DAG);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1d2dda5894b15..ce40e202f30f5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -991,6 +991,14 @@ def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
 def AArch64fcvtzs_half : SDNode<"AArch64ISD::FCVTZS_HALF", SDTFPExtendOp>;
 def AArch64fcvtzu_half : SDNode<"AArch64ISD::FCVTZU_HALF", SDTFPExtendOp>;
+def AArch64fcvtas_half : SDNode<"AArch64ISD::FCVTAS_HALF", SDTFPExtendOp>;
+def AArch64fcvtau_half : SDNode<"AArch64ISD::FCVTAU_HALF", SDTFPExtendOp>;
+def AArch64fcvtms_half : SDNode<"AArch64ISD::FCVTMS_HALF", SDTFPExtendOp>;
+def AArch64fcvtmu_half : SDNode<"AArch64ISD::FCVTMU_HALF", SDTFPExtendOp>;
+def AArch64fcvtns_half : SDNode<"AArch64ISD::FCVTNS_HALF", SDTFPExtendOp>;
+def AArch64fcvtnu_half : SDNode<"AArch64ISD::FCVTNU_HALF", SDTFPExtendOp>;
+def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>;
+def AArch64fcvtpu_half : SDNode<"AArch64ISD::FCVTPU_HALF", SDTFPExtendOp>;
 
 //def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
@@ -6586,6 +6594,14 @@ class F16ToI16ScalarPat
 let Predicates = [HasFullFP16] in {
 def : F16ToI16ScalarPat<AArch64fcvtzs_half, FCVTZSv1f16>;
 def : F16ToI16ScalarPat<AArch64fcvtzu_half, FCVTZUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtas_half, FCVTASv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtau_half, FCVTAUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtms_half, FCVTMSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtmu_half, FCVTMUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtns_half, FCVTNSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtnu_half, FCVTNUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtps_half, FCVTPSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtpu_half, FCVTPUv1f16>;
 }
 
 // Round FP64 to BF16.
diff --git a/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll b/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll
new file mode 100644
index 0000000000000..ab502508fadbd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp16_i16_intrinsic_scalar.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s
+
+; Test f16 -> i16 NEON intrinsics, currently only supported in SDAG.
+; Should be merged with fp16_intrinsic_scalar_1op.ll once there is
+; support in GISel.
+
+declare i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtas.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtau.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtms.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtns.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtps.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half)
+
+
+define i16 @fcvtzs_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtzs_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtzu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtzu_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtas_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtas_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtas h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtau_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtau_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtau h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtms_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtms_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtms h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtms.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtmu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtmu_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtmu h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtns_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtns_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtns h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtns.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtnu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtnu_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtnu h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtps_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtps_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtps h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtps.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtpu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtpu_intrinsic_i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtpu h0, h0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half %a)
+  ret i16 %fcvt
+}
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll
index 58cbc2953dbcd..b4fc8971ede8a 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_vector_1op.ll
@@ -1,13 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s
 
 declare <4 x half> @llvm.nearbyint.v4f16(<4 x half>)
 declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>)
 declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
 declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half>)
 
 define dso_local <4 x half> @t_vrndi_f16(<4 x half> %a) {
 ; CHECK-LABEL: t_vrndi_f16:
-; CHECK: frinti v0.4h, v0.4h
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frinti v0.4h, v0.4h
 ; CHECK-NEXT: ret
 entry:
   %vrndi1.i = tail call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %a)
@@ -16,7 +28,8 @@ entry:
 
 define dso_local <8 x half> @t_vrndiq_f16(<8 x half> %a) {
 ; CHECK-LABEL: t_vrndiq_f16:
-; CHECK: frinti v0.8h, v0.8h
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frinti v0.8h, v0.8h
 ; CHECK-NEXT: ret
 entry:
   %vrndi1.i = tail call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %a)
@@ -25,7 +38,8 @@ entry:
 
 define dso_local <4 x half> @t_vsqrt_f16(<4 x half> %a) {
 ; CHECK-LABEL: t_vsqrt_f16:
-; CHECK: fsqrt v0.4h, v0.4h
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4h, v0.4h
 ; CHECK-NEXT: ret
 entry:
   %vsqrt.i = tail call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
@@ -34,9 +48,110 @@ entry:
 
 define dso_local <8 x half> @t_vsqrtq_f16(<8 x half> %a) {
 ; CHECK-LABEL: t_vsqrtq_f16:
-; CHECK: fsqrt v0.8h, v0.8h
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.8h, v0.8h
 ; CHECK-NEXT: ret
 entry:
   %vsqrt.i = tail call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a)
   ret <8 x half> %vsqrt.i
 }
+
+define <4 x i16> @t_fcvtzs_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtzs_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtzu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtzu_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtas_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtas_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtas v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtau_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtau_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtau v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtms_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtms_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtms v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtmu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtmu_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtmu v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtns_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtns_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtns v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtnu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtnu_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtnu v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtps_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtps_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtps v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtpu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtpu_v4i16_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtpu v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
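
Note (reviewer illustration, not part of the patch): a minimal C sketch of how these intrinsics typically reach the backend via the ACLE fp16 conversions. The function names below are made up for the example, and it assumes a Clang that lowers the scalar ACLE calls to the i16-typed @llvm.aarch64.neon.fcvt* intrinsics (rather than the i32 forms plus a truncate); compile with something like -target aarch64 -march=armv8.2-a+fp16 -O2.

  #include <arm_fp16.h>   // scalar float16_t conversion intrinsics
  #include <arm_neon.h>   // vector float16x4_t conversion intrinsics
  #include <stdint.h>

  // Scalar f16 -> i16, round to nearest with ties away: with the new
  // FCVTAS_HALF node and F16ToI16ScalarPat pattern this should select
  // "fcvtas h0, h0" followed by "fmov w0, s0", as in the new scalar test.
  int16_t cvt_away_scalar(float16_t x) { return vcvtah_s16_f16(x); }

  // Round toward minus infinity and round to nearest even hit the
  // FCVTMS_HALF / FCVTNS_HALF paths in the same way.
  int16_t cvt_floor_scalar(float16_t x) { return vcvtmh_s16_f16(x); }
  int16_t cvt_even_scalar(float16_t x)  { return vcvtnh_s16_f16(x); }

  // Vector v4f16 -> v4i16 is already selected directly as
  // "fcvtas v0.4h, v0.4h"; the added fp16_intrinsic_vector_1op.ll
  // cases pin that behaviour down.
  int16x4_t cvt_away_vec(float16x4_t v) { return vcvta_s16_f16(v); }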