102 changes: 98 additions & 4 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def HasRDM : Predicate<"Subtarget->hasRDM()">,
AssemblerPredicateWithAll<(all_of FeatureRDM), "rdm">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicateWithAll<(all_of FeatureFullFP16), "fullfp16">;
// Codegen-only predicate: true when Subtarget->hasFullFP16() is false.
// Unlike HasFullFP16 it carries no AssemblerPredicate, so it only gates
// instruction-selection patterns.
def HasNoFullFP16 : Predicate<"!Subtarget->hasFullFP16()">;
def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
AssemblerPredicateWithAll<(all_of FeatureFP16FML), "fp16fml">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
Expand Down Expand Up @@ -254,6 +255,7 @@ def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
AssemblerPredicateWithAll<(all_of FeatureTRBE), "trbe">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">,
AssemblerPredicateWithAll<(all_of FeatureBF16), "bf16">;
// Codegen-only predicate: true when Subtarget->hasBF16() is false.
// Unlike HasBF16 it carries no AssemblerPredicate, so it only gates
// instruction-selection patterns.
def HasNoBF16 : Predicate<"!Subtarget->hasBF16()">;
def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">,
AssemblerPredicateWithAll<(all_of FeatureMatMulInt8), "i8mm">;
def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">,
Expand Down Expand Up @@ -757,9 +759,14 @@ def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;

def AArch64fcvtxn_n: SDNode<"AArch64ISD::FCVTXN", SDTFPRoundOp>;
def AArch64fcvtxn: PatFrags<(ops node:$Rn),
[(f32 (int_aarch64_sisd_fcvtxn (f64 node:$Rn))),
(f32 (AArch64fcvtxn_n (f64 node:$Rn)))]>;
// Scalar f64 -> f32 FCVTXN fragment. Matches either the
// int_aarch64_sisd_fcvtxn intrinsic or the AArch64fcvtxn_n node
// (AArch64ISD::FCVTXN), with the operand and result types pinned explicitly.
def AArch64fcvtxnsdr: PatFrags<(ops node:$Rn),
[(f32 (int_aarch64_sisd_fcvtxn (f64 node:$Rn))),
(f32 (AArch64fcvtxn_n (f64 node:$Rn)))]>;
// FCVTXN fragment without type constraints. Matches either the
// int_aarch64_neon_fcvtxn intrinsic or the AArch64fcvtxn_n node
// (AArch64ISD::FCVTXN); operand/result types are left to the user of the
// fragment.
def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
[(int_aarch64_neon_fcvtxn node:$Rn),
(AArch64fcvtxn_n node:$Rn)]>;

//def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;

def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
Expand Down Expand Up @@ -5042,7 +5049,7 @@ def : Pat<(concat_vectors V64:$Rd, (v4f16 (any_fpround (v4f32 V128:$Rn)))),
defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
int_aarch64_neon_fcvtxn>;
AArch64fcvtxnv>;
defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>;
defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;

Expand Down Expand Up @@ -9736,6 +9743,93 @@ let Predicates = [HasCPA] in {
def MSUBPT : MulAccumCPA<1, "msubpt">;
}

// Output fragment that rounds each fp32 lane of $Rn towards a bf16 value using
// integer arithmetic only. The result is still a 4 x 32-bit vector; the
// patterns that use this fragment pick the upper 16 bits of every lane
// afterwards (see the UZP2v8i16 users).
// NOTE(review): the (i32 127), (i32 264) operands of MOVIv4s_msl presumably
// encode "#127, msl #8" (0x7FFF), and (i32 64), (i32 16) on ORRv4i32 encode
// "#64, lsl #16" (0x00400000) - confirm against the immediate encodings.
def round_v4fp32_to_v4bf16 :
OutPatFrag<(ops node:$Rn),
// Not NaN? Round : Quiet(NaN)
// The FCMEQ self-compare is the select mask: lanes where $Rn == $Rn take the
// rounded sum, the remaining (NaN) lanes take the quieted input.
(BSPv16i8 (FCMEQv4f32 $Rn, $Rn),
(ADDv4i32
(ADDv4i32 $Rn,
// Extract the LSB of the fp32 *truncated* to bf16.
(ANDv16i8 (USHRv4i32_shift V128:$Rn, (i32 16)),
(MOVIv4i32 (i32 1), (i32 0)))),
// Bias which will help us break ties correctly.
(MOVIv4s_msl (i32 127), (i32 264))),
// Set the quiet bit in the NaN.
(ORRv4i32 $Rn, (i32 64), (i32 16)))>;

// Selects a unary operation on an 8-lane 16-bit floating-point vector by
// performing it as two 4-lane f32 operations.
//   InOp    - the SelectionDAG operator to match on v8f16 / v8bf16.
//   OutInst - the v4f32 instruction that implements the operation.
// Each pattern widens the two halves of $Rn to v4f32, applies OutInst to each
// half, and narrows the two results back into a single 128-bit vector.
multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst> {
// v8f16 when full FP16 arithmetic is unavailable: widen with FCVTL
// (FCVTLv4i16 on the dsub half, FCVTLv8i16 on the full register), narrow with
// FCVTNv4i16 into the dsub half of an otherwise undefined register and
// FCVTNv8i16 for the second result.
let Predicates = [HasNoFullFP16] in
def : Pat<(InOp (v8f16 V128:$Rn)),
(v8f16 (FCVTNv8i16
(INSERT_SUBREG (IMPLICIT_DEF),
(v4f16 (FCVTNv4i16
(v4f32 (OutInst
(v4f32 (FCVTLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
dsub),
(v4f32 (OutInst (v4f32 (FCVTLv8i16 V128:$Rn))))))>;

// v8bf16 when BF16 instructions are available: widen with SHLL, narrow with
// BFCVTN / BFCVTN2.
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn)),
(v8bf16 (BFCVTN2
(v8bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;

// v8bf16 without BF16 instructions: widen with SHLL, round each f32 result
// with round_v4fp32_to_v4bf16, then combine the two rounded vectors with
// UZP2v8i16.
let Predicates = [HasNoBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn)),
(UZP2v8i16
(round_v4fp32_to_v4bf16 (v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub))))))),
(round_v4fp32_to_v4bf16 (v4f32 (OutInst
(v4f32 (SHLLv8i16 V128:$Rn))))))>;
}
// Instantiate the promotion patterns for the rounding operations, pairing each
// DAG operator with the v4f32 FRINT* instruction used to implement it.
defm : PromoteUnaryv8f16Tov4f32<any_fceil, FRINTPv4f32>;
defm : PromoteUnaryv8f16Tov4f32<any_ffloor, FRINTMv4f32>;
defm : PromoteUnaryv8f16Tov4f32<any_fnearbyint, FRINTIv4f32>;
defm : PromoteUnaryv8f16Tov4f32<any_fround, FRINTAv4f32>;
defm : PromoteUnaryv8f16Tov4f32<any_froundeven, FRINTNv4f32>;
defm : PromoteUnaryv8f16Tov4f32<any_frint, FRINTXv4f32>;
defm : PromoteUnaryv8f16Tov4f32<any_ftrunc, FRINTZv4f32>;

// Selects a binary operation on two 8-lane 16-bit floating-point vectors by
// performing it as two 4-lane f32 operations.
//   InOp    - the SelectionDAG operator to match on v8f16 / v8bf16.
//   OutInst - the v4f32 instruction that implements the operation.
// Each pattern widens the matching halves of $Rn and $Rm to v4f32, applies
// OutInst with $Rn's half as the first operand and $Rm's half as the second,
// and narrows the two results back into a single 128-bit vector.
multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst> {
// v8f16 when full FP16 arithmetic is unavailable: widen with FCVTL
// (FCVTLv4i16 on the dsub halves, FCVTLv8i16 on the full registers), narrow
// with FCVTNv4i16 into the dsub half of an otherwise undefined register and
// FCVTNv8i16 for the second result.
let Predicates = [HasNoFullFP16] in
def : Pat<(InOp (v8f16 V128:$Rn), (v8f16 V128:$Rm)),
(v8f16 (FCVTNv8i16
(INSERT_SUBREG (IMPLICIT_DEF),
(v4f16 (FCVTNv4i16
(v4f32 (OutInst
(v4f32 (FCVTLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
(v4f32 (FCVTLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
dsub),
(v4f32 (OutInst (v4f32 (FCVTLv8i16 V128:$Rn)),
(v4f32 (FCVTLv8i16 V128:$Rm))))))>;

// v8bf16 when BF16 instructions are available: widen with SHLL, narrow with
// BFCVTN / BFCVTN2.
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
(v8bf16 (BFCVTN2
(v8bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
(v4f32 (SHLLv8i16 V128:$Rm))))))>;

// v8bf16 without BF16 instructions: widen with SHLL, round each f32 result
// with round_v4fp32_to_v4bf16, then combine the two rounded vectors with
// UZP2v8i16.
let Predicates = [HasNoBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
(UZP2v8i16
(round_v4fp32_to_v4bf16 (v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub))))))),
(round_v4fp32_to_v4bf16 (v4f32 (OutInst
(v4f32 (SHLLv8i16 V128:$Rn)),
(v4f32 (SHLLv8i16 V128:$Rm))))))>;
}
// Instantiate the promotion patterns for the four basic arithmetic operations,
// pairing each DAG operator with its v4f32 instruction.
defm : PromoteBinaryv8f16Tov4f32<any_fadd, FADDv4f32>;
defm : PromoteBinaryv8f16Tov4f32<any_fdiv, FDIVv4f32>;
defm : PromoteBinaryv8f16Tov4f32<any_fmul, FMULv4f32>;
defm : PromoteBinaryv8f16Tov4f32<any_fsub, FSUBv4f32>;

include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
include "AArch64SMEInstrInfo.td"
Expand Down
56 changes: 56 additions & 0 deletions llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,60 @@ define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp {
ret <2 x float> %vcvt1.i
}

; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_bf16_f64)
; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_bf16_f64)
define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp {
; GENERIC-LABEL: test_vcvt_bf16_f64:
; GENERIC: // %bb.0:
; GENERIC-NEXT: fcvtxn v0.2s, v0.2d
; GENERIC-NEXT: movi.4s v1, #127, msl #8
; GENERIC-NEXT: movi.4s v2, #1
; GENERIC-NEXT: ushr.4s v3, v0, #16
; GENERIC-NEXT: add.4s v1, v0, v1
; GENERIC-NEXT: and.16b v2, v3, v2
; GENERIC-NEXT: add.4s v1, v2, v1
; GENERIC-NEXT: fcmeq.4s v2, v0, v0
; GENERIC-NEXT: orr.4s v0, #64, lsl #16
; GENERIC-NEXT: bit.16b v0, v1, v2
; GENERIC-NEXT: shrn.4h v0, v0, #16
; GENERIC-NEXT: ret
;
; FAST-LABEL: test_vcvt_bf16_f64:
; FAST: // %bb.0:
; FAST-NEXT: fcvtxn v1.2s, v0.2d
; FAST-NEXT: // implicit-def: $q0
; FAST-NEXT: fmov d0, d1
; FAST-NEXT: ushr.4s v1, v0, #16
; FAST-NEXT: movi.4s v2, #1
; FAST-NEXT: and.16b v1, v1, v2
; FAST-NEXT: add.4s v1, v1, v0
; FAST-NEXT: movi.4s v2, #127, msl #8
; FAST-NEXT: add.4s v1, v1, v2
; FAST-NEXT: mov.16b v2, v0
; FAST-NEXT: orr.4s v2, #64, lsl #16
; FAST-NEXT: fcmeq.4s v0, v0, v0
; FAST-NEXT: bsl.16b v0, v1, v2
; FAST-NEXT: shrn.4h v0, v0, #16
; FAST-NEXT: ret
;
; GISEL-LABEL: test_vcvt_bf16_f64:
; GISEL: // %bb.0:
; GISEL-NEXT: fcvtxn v0.2s, v0.2d
; GISEL-NEXT: movi.4s v1, #127, msl #8
; GISEL-NEXT: movi.4s v2, #1
; GISEL-NEXT: ushr.4s v3, v0, #16
; GISEL-NEXT: add.4s v1, v0, v1
; GISEL-NEXT: and.16b v2, v3, v2
; GISEL-NEXT: add.4s v1, v2, v1
; GISEL-NEXT: fcmeq.4s v2, v0, v0
; GISEL-NEXT: orr.4s v0, #64, lsl #16
; GISEL-NEXT: bit.16b v0, v1, v2
; GISEL-NEXT: shrn.4h v0, v0, #16
; GISEL-NEXT: ret
%vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat>
ret <2 x bfloat> %vcvt1.i
}

define half @test_vcvt_f16_f32(<1 x float> %x) {
; GENERIC-LABEL: test_vcvt_f16_f32:
; GENERIC: // %bb.0:
Expand Down Expand Up @@ -350,3 +404,5 @@ define float @from_half(i16 %in) {

declare float @llvm.convert.from.fp16.f32(i16) #1
declare i16 @llvm.convert.to.fp16.f32(float) #1
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; FALLBACK: {{.*}}
218 changes: 45 additions & 173 deletions llvm/test/CodeGen/AArch64/faddp-half.ll
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,15 @@ define half @faddp_8xhalf(<8 x half> %a) {
; CHECKNOFP16-LABEL: faddp_8xhalf:
; CHECKNOFP16: // %bb.0: // %entry
; CHECKNOFP16-NEXT: dup v1.8h, v0.h[1]
; CHECKNOFP16-NEXT: fcvt s0, h0
; CHECKNOFP16-NEXT: fcvt s1, h1
; CHECKNOFP16-NEXT: fadd s0, s0, s1
; CHECKNOFP16-NEXT: fcvt h0, s0
; CHECKNOFP16-NEXT: fcvtl v2.4s, v0.4h
; CHECKNOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECKNOFP16-NEXT: fcvtl v3.4s, v1.4h
; CHECKNOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECKNOFP16-NEXT: fadd v2.4s, v2.4s, v3.4s
; CHECKNOFP16-NEXT: fadd v1.4s, v0.4s, v1.4s
; CHECKNOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECKNOFP16-NEXT: fcvtn2 v0.8h, v1.4s
; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECKNOFP16-NEXT: ret
entry:
%shift = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -128,10 +133,15 @@ define half @faddp_8xhalf_commute(<8 x half> %a) {
; CHECKNOFP16-LABEL: faddp_8xhalf_commute:
; CHECKNOFP16: // %bb.0: // %entry
; CHECKNOFP16-NEXT: dup v1.8h, v0.h[1]
; CHECKNOFP16-NEXT: fcvt s0, h0
; CHECKNOFP16-NEXT: fcvt s1, h1
; CHECKNOFP16-NEXT: fadd s0, s1, s0
; CHECKNOFP16-NEXT: fcvt h0, s0
; CHECKNOFP16-NEXT: fcvtl v2.4s, v0.4h
; CHECKNOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECKNOFP16-NEXT: fcvtl v3.4s, v1.4h
; CHECKNOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECKNOFP16-NEXT: fadd v2.4s, v3.4s, v2.4s
; CHECKNOFP16-NEXT: fadd v1.4s, v1.4s, v0.4s
; CHECKNOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECKNOFP16-NEXT: fcvtn2 v0.8h, v1.4s
; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECKNOFP16-NEXT: ret
entry:
%shift = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
Expand All @@ -149,61 +159,15 @@ define <8 x half> @addp_v8f16(<8 x half> %a) {
;
; CHECKNOFP16-LABEL: addp_v8f16:
; CHECKNOFP16: // %bb.0: // %entry
; CHECKNOFP16-NEXT: rev32 v2.8h, v0.8h
; CHECKNOFP16-NEXT: mov h1, v0.h[1]
; CHECKNOFP16-NEXT: fcvt s4, h0
; CHECKNOFP16-NEXT: mov h5, v0.h[2]
; CHECKNOFP16-NEXT: mov h16, v0.h[3]
; CHECKNOFP16-NEXT: mov h3, v2.h[1]
; CHECKNOFP16-NEXT: fcvt s6, h2
; CHECKNOFP16-NEXT: fcvt s1, h1
; CHECKNOFP16-NEXT: mov h7, v2.h[2]
; CHECKNOFP16-NEXT: fcvt s5, h5
; CHECKNOFP16-NEXT: fcvt s16, h16
; CHECKNOFP16-NEXT: fcvt s3, h3
; CHECKNOFP16-NEXT: fadd s4, s6, s4
; CHECKNOFP16-NEXT: mov h6, v2.h[3]
; CHECKNOFP16-NEXT: fcvt s7, h7
; CHECKNOFP16-NEXT: fadd s3, s3, s1
; CHECKNOFP16-NEXT: fcvt s6, h6
; CHECKNOFP16-NEXT: fcvt h1, s4
; CHECKNOFP16-NEXT: fadd s4, s7, s5
; CHECKNOFP16-NEXT: mov h5, v0.h[4]
; CHECKNOFP16-NEXT: mov h7, v2.h[4]
; CHECKNOFP16-NEXT: fcvt h3, s3
; CHECKNOFP16-NEXT: fadd s6, s6, s16
; CHECKNOFP16-NEXT: mov h16, v2.h[5]
; CHECKNOFP16-NEXT: fcvt h4, s4
; CHECKNOFP16-NEXT: mov v1.h[1], v3.h[0]
; CHECKNOFP16-NEXT: fcvt s3, h5
; CHECKNOFP16-NEXT: fcvt s5, h7
; CHECKNOFP16-NEXT: mov h7, v0.h[5]
; CHECKNOFP16-NEXT: fcvt h6, s6
; CHECKNOFP16-NEXT: fcvt s16, h16
; CHECKNOFP16-NEXT: mov v1.h[2], v4.h[0]
; CHECKNOFP16-NEXT: mov h4, v0.h[6]
; CHECKNOFP16-NEXT: fadd s3, s5, s3
; CHECKNOFP16-NEXT: mov h5, v2.h[6]
; CHECKNOFP16-NEXT: fcvt s7, h7
; CHECKNOFP16-NEXT: mov h0, v0.h[7]
; CHECKNOFP16-NEXT: mov h2, v2.h[7]
; CHECKNOFP16-NEXT: mov v1.h[3], v6.h[0]
; CHECKNOFP16-NEXT: fcvt s4, h4
; CHECKNOFP16-NEXT: fcvt h3, s3
; CHECKNOFP16-NEXT: fcvt s5, h5
; CHECKNOFP16-NEXT: fadd s6, s16, s7
; CHECKNOFP16-NEXT: fcvt s0, h0
; CHECKNOFP16-NEXT: fcvt s2, h2
; CHECKNOFP16-NEXT: mov v1.h[4], v3.h[0]
; CHECKNOFP16-NEXT: fadd s4, s5, s4
; CHECKNOFP16-NEXT: fcvt h3, s6
; CHECKNOFP16-NEXT: fadd s0, s2, s0
; CHECKNOFP16-NEXT: mov v1.h[5], v3.h[0]
; CHECKNOFP16-NEXT: fcvt h3, s4
; CHECKNOFP16-NEXT: fcvt h0, s0
; CHECKNOFP16-NEXT: mov v1.h[6], v3.h[0]
; CHECKNOFP16-NEXT: mov v1.h[7], v0.h[0]
; CHECKNOFP16-NEXT: mov v0.16b, v1.16b
; CHECKNOFP16-NEXT: rev32 v1.8h, v0.8h
; CHECKNOFP16-NEXT: fcvtl v2.4s, v0.4h
; CHECKNOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECKNOFP16-NEXT: fcvtl v3.4s, v1.4h
; CHECKNOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECKNOFP16-NEXT: fadd v2.4s, v3.4s, v2.4s
; CHECKNOFP16-NEXT: fadd v1.4s, v1.4s, v0.4s
; CHECKNOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECKNOFP16-NEXT: fcvtn2 v0.8h, v1.4s
; CHECKNOFP16-NEXT: ret
entry:
%s = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
Expand All @@ -221,116 +185,24 @@ define <16 x half> @addp_v16f16(<16 x half> %a) {
;
; CHECKNOFP16-LABEL: addp_v16f16:
; CHECKNOFP16: // %bb.0: // %entry
; CHECKNOFP16-NEXT: rev32 v5.8h, v0.8h
; CHECKNOFP16-NEXT: rev32 v4.8h, v1.8h
; CHECKNOFP16-NEXT: mov h3, v0.h[1]
; CHECKNOFP16-NEXT: mov h6, v1.h[1]
; CHECKNOFP16-NEXT: fcvt s16, h0
; CHECKNOFP16-NEXT: mov h17, v0.h[2]
; CHECKNOFP16-NEXT: fcvt s20, h1
; CHECKNOFP16-NEXT: mov h21, v1.h[2]
; CHECKNOFP16-NEXT: mov h2, v5.h[1]
; CHECKNOFP16-NEXT: mov h7, v4.h[1]
; CHECKNOFP16-NEXT: fcvt s3, h3
; CHECKNOFP16-NEXT: fcvt s18, h5
; CHECKNOFP16-NEXT: mov h19, v5.h[2]
; CHECKNOFP16-NEXT: fcvt s6, h6
; CHECKNOFP16-NEXT: fcvt s22, h4
; CHECKNOFP16-NEXT: mov h23, v4.h[2]
; CHECKNOFP16-NEXT: fcvt s17, h17
; CHECKNOFP16-NEXT: mov h24, v5.h[3]
; CHECKNOFP16-NEXT: fcvt s21, h21
; CHECKNOFP16-NEXT: mov h25, v4.h[6]
; CHECKNOFP16-NEXT: fcvt s2, h2
; CHECKNOFP16-NEXT: fcvt s7, h7
; CHECKNOFP16-NEXT: fadd s16, s18, s16
; CHECKNOFP16-NEXT: fcvt s18, h19
; CHECKNOFP16-NEXT: mov h19, v0.h[3]
; CHECKNOFP16-NEXT: fadd s20, s22, s20
; CHECKNOFP16-NEXT: fcvt s22, h23
; CHECKNOFP16-NEXT: mov h23, v4.h[3]
; CHECKNOFP16-NEXT: fadd s3, s2, s3
; CHECKNOFP16-NEXT: fadd s6, s7, s6
; CHECKNOFP16-NEXT: mov h7, v1.h[3]
; CHECKNOFP16-NEXT: fcvt h2, s16
; CHECKNOFP16-NEXT: fadd s16, s18, s17
; CHECKNOFP16-NEXT: fcvt s18, h19
; CHECKNOFP16-NEXT: fcvt s19, h24
; CHECKNOFP16-NEXT: mov h24, v5.h[6]
; CHECKNOFP16-NEXT: fcvt h17, s3
; CHECKNOFP16-NEXT: fcvt h3, s20
; CHECKNOFP16-NEXT: fadd s20, s22, s21
; CHECKNOFP16-NEXT: fcvt h6, s6
; CHECKNOFP16-NEXT: fcvt s7, h7
; CHECKNOFP16-NEXT: fcvt s22, h23
; CHECKNOFP16-NEXT: mov h21, v0.h[4]
; CHECKNOFP16-NEXT: mov h23, v5.h[4]
; CHECKNOFP16-NEXT: fcvt h16, s16
; CHECKNOFP16-NEXT: fadd s18, s19, s18
; CHECKNOFP16-NEXT: mov h19, v4.h[4]
; CHECKNOFP16-NEXT: mov v2.h[1], v17.h[0]
; CHECKNOFP16-NEXT: mov h17, v1.h[4]
; CHECKNOFP16-NEXT: mov v3.h[1], v6.h[0]
; CHECKNOFP16-NEXT: fcvt h6, s20
; CHECKNOFP16-NEXT: fadd s7, s22, s7
; CHECKNOFP16-NEXT: fcvt s20, h21
; CHECKNOFP16-NEXT: mov h21, v0.h[5]
; CHECKNOFP16-NEXT: mov h22, v5.h[5]
; CHECKNOFP16-NEXT: fcvt h18, s18
; CHECKNOFP16-NEXT: fcvt s19, h19
; CHECKNOFP16-NEXT: mov h5, v5.h[7]
; CHECKNOFP16-NEXT: mov v2.h[2], v16.h[0]
; CHECKNOFP16-NEXT: fcvt s16, h23
; CHECKNOFP16-NEXT: fcvt s17, h17
; CHECKNOFP16-NEXT: mov v3.h[2], v6.h[0]
; CHECKNOFP16-NEXT: fcvt h6, s7
; CHECKNOFP16-NEXT: mov h7, v1.h[5]
; CHECKNOFP16-NEXT: mov h23, v4.h[5]
; CHECKNOFP16-NEXT: mov h4, v4.h[7]
; CHECKNOFP16-NEXT: fcvt s5, h5
; CHECKNOFP16-NEXT: fadd s16, s16, s20
; CHECKNOFP16-NEXT: mov h20, v0.h[6]
; CHECKNOFP16-NEXT: fadd s17, s19, s17
; CHECKNOFP16-NEXT: mov h19, v1.h[6]
; CHECKNOFP16-NEXT: mov v2.h[3], v18.h[0]
; CHECKNOFP16-NEXT: fcvt s18, h21
; CHECKNOFP16-NEXT: fcvt s21, h22
; CHECKNOFP16-NEXT: mov v3.h[3], v6.h[0]
; CHECKNOFP16-NEXT: fcvt s6, h7
; CHECKNOFP16-NEXT: fcvt s7, h23
; CHECKNOFP16-NEXT: fcvt s22, h24
; CHECKNOFP16-NEXT: fcvt s23, h25
; CHECKNOFP16-NEXT: fcvt h16, s16
; CHECKNOFP16-NEXT: fcvt s20, h20
; CHECKNOFP16-NEXT: fcvt h17, s17
; CHECKNOFP16-NEXT: fcvt s19, h19
; CHECKNOFP16-NEXT: mov h0, v0.h[7]
; CHECKNOFP16-NEXT: mov h1, v1.h[7]
; CHECKNOFP16-NEXT: fadd s18, s21, s18
; CHECKNOFP16-NEXT: fcvt s4, h4
; CHECKNOFP16-NEXT: fadd s6, s7, s6
; CHECKNOFP16-NEXT: mov v2.h[4], v16.h[0]
; CHECKNOFP16-NEXT: fadd s7, s22, s20
; CHECKNOFP16-NEXT: mov v3.h[4], v17.h[0]
; CHECKNOFP16-NEXT: fadd s16, s23, s19
; CHECKNOFP16-NEXT: fcvt s0, h0
; CHECKNOFP16-NEXT: fcvt s1, h1
; CHECKNOFP16-NEXT: fcvt h17, s18
; CHECKNOFP16-NEXT: fcvt h6, s6
; CHECKNOFP16-NEXT: fadd s0, s5, s0
; CHECKNOFP16-NEXT: fcvt h5, s7
; CHECKNOFP16-NEXT: fadd s1, s4, s1
; CHECKNOFP16-NEXT: mov v2.h[5], v17.h[0]
; CHECKNOFP16-NEXT: mov v3.h[5], v6.h[0]
; CHECKNOFP16-NEXT: fcvt h6, s16
; CHECKNOFP16-NEXT: fcvt h0, s0
; CHECKNOFP16-NEXT: fcvt h1, s1
; CHECKNOFP16-NEXT: mov v2.h[6], v5.h[0]
; CHECKNOFP16-NEXT: mov v3.h[6], v6.h[0]
; CHECKNOFP16-NEXT: mov v2.h[7], v0.h[0]
; CHECKNOFP16-NEXT: mov v3.h[7], v1.h[0]
; CHECKNOFP16-NEXT: mov v0.16b, v2.16b
; CHECKNOFP16-NEXT: mov v1.16b, v3.16b
; CHECKNOFP16-NEXT: rev32 v2.8h, v0.8h
; CHECKNOFP16-NEXT: rev32 v3.8h, v1.8h
; CHECKNOFP16-NEXT: fcvtl v4.4s, v0.4h
; CHECKNOFP16-NEXT: fcvtl v6.4s, v1.4h
; CHECKNOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECKNOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECKNOFP16-NEXT: fcvtl v5.4s, v2.4h
; CHECKNOFP16-NEXT: fcvtl v7.4s, v3.4h
; CHECKNOFP16-NEXT: fcvtl2 v2.4s, v2.8h
; CHECKNOFP16-NEXT: fcvtl2 v3.4s, v3.8h
; CHECKNOFP16-NEXT: fadd v4.4s, v5.4s, v4.4s
; CHECKNOFP16-NEXT: fadd v5.4s, v7.4s, v6.4s
; CHECKNOFP16-NEXT: fadd v2.4s, v2.4s, v0.4s
; CHECKNOFP16-NEXT: fadd v3.4s, v3.4s, v1.4s
; CHECKNOFP16-NEXT: fcvtn v0.4h, v4.4s
; CHECKNOFP16-NEXT: fcvtn v1.4h, v5.4s
; CHECKNOFP16-NEXT: fcvtn2 v0.8h, v2.4s
; CHECKNOFP16-NEXT: fcvtn2 v1.8h, v3.4s
; CHECKNOFP16-NEXT: ret
entry:
%s = shufflevector <16 x half> %a, <16 x half> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
Expand Down
496 changes: 64 additions & 432 deletions llvm/test/CodeGen/AArch64/faddsub.ll

Large diffs are not rendered by default.

1,421 changes: 189 additions & 1,232 deletions llvm/test/CodeGen/AArch64/fcvt.ll

Large diffs are not rendered by default.

140 changes: 41 additions & 99 deletions llvm/test/CodeGen/AArch64/fcvt_combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -180,48 +180,19 @@ define <3 x i32> @test_illegal_fp_to_int(<3 x float> %in) {
define <8 x i16> @test_v8f16(<8 x half> %in) {
; CHECK-NO16-LABEL: test_v8f16:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: mov h2, v0.h[1]
; CHECK-NO16-NEXT: mov h3, v0.h[4]
; CHECK-NO16-NEXT: mov h4, v0.h[5]
; CHECK-NO16-NEXT: mov h5, v0.h[2]
; CHECK-NO16-NEXT: fcvt s6, h0
; CHECK-NO16-NEXT: mov h7, v0.h[6]
; CHECK-NO16-NEXT: fmov s1, #4.00000000
; CHECK-NO16-NEXT: mov h16, v0.h[3]
; CHECK-NO16-NEXT: mov h0, v0.h[7]
; CHECK-NO16-NEXT: fcvt s2, h2
; CHECK-NO16-NEXT: fcvt s3, h3
; CHECK-NO16-NEXT: fcvt s4, h4
; CHECK-NO16-NEXT: fmul s6, s6, s1
; CHECK-NO16-NEXT: fcvt s5, h5
; CHECK-NO16-NEXT: fcvt s7, h7
; CHECK-NO16-NEXT: fcvt s16, h16
; CHECK-NO16-NEXT: fcvt s0, h0
; CHECK-NO16-NEXT: fmul s2, s2, s1
; CHECK-NO16-NEXT: fmul s3, s3, s1
; CHECK-NO16-NEXT: fmul s4, s4, s1
; CHECK-NO16-NEXT: fmul s5, s5, s1
; CHECK-NO16-NEXT: fcvt h6, s6
; CHECK-NO16-NEXT: fmul s7, s7, s1
; CHECK-NO16-NEXT: fmul s16, s16, s1
; CHECK-NO16-NEXT: fmul s0, s0, s1
; CHECK-NO16-NEXT: fcvt h2, s2
; CHECK-NO16-NEXT: fcvt h3, s3
; CHECK-NO16-NEXT: fcvt h4, s4
; CHECK-NO16-NEXT: fcvt h5, s5
; CHECK-NO16-NEXT: fcvt h1, s7
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: mov v6.h[1], v2.h[0]
; CHECK-NO16-NEXT: fcvt h2, s16
; CHECK-NO16-NEXT: mov v3.h[1], v4.h[0]
; CHECK-NO16-NEXT: mov v6.h[2], v5.h[0]
; CHECK-NO16-NEXT: mov v3.h[2], v1.h[0]
; CHECK-NO16-NEXT: mov v6.h[3], v2.h[0]
; CHECK-NO16-NEXT: mov v3.h[3], v0.h[0]
; CHECK-NO16-NEXT: fcvtl v1.4s, v6.4h
; CHECK-NO16-NEXT: fcvtl v0.4s, v3.4h
; CHECK-NO16-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-NO16-NEXT: movi v1.8h, #68, lsl #8
; CHECK-NO16-NEXT: fcvtl v2.4s, v0.4h
; CHECK-NO16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-NO16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-NO16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-NO16-NEXT: fmul v2.4s, v2.4s, v3.4s
; CHECK-NO16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-NO16-NEXT: fcvtn v1.4h, v2.4s
; CHECK-NO16-NEXT: fcvtn2 v1.8h, v0.4s
; CHECK-NO16-NEXT: fcvtl2 v0.4s, v1.8h
; CHECK-NO16-NEXT: fcvtl v1.4s, v1.4h
; CHECK-NO16-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-NO16-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-NO16-NEXT: uzp1 v0.8h, v1.8h, v0.8h
; CHECK-NO16-NEXT: ret
;
Expand Down Expand Up @@ -496,96 +467,67 @@ define <3 x i32> @test_illegal_fp_to_int_sat_sat(<3 x float> %in) {
define <8 x i16> @test_v8f16_sat(<8 x half> %in) {
; CHECK-NO16-LABEL: test_v8f16_sat:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: mov h2, v0.h[4]
; CHECK-NO16-NEXT: mov h3, v0.h[5]
; CHECK-NO16-NEXT: movi v1.8h, #68, lsl #8
; CHECK-NO16-NEXT: fcvtl v2.4s, v0.4h
; CHECK-NO16-NEXT: mov w8, #32767 // =0x7fff
; CHECK-NO16-NEXT: mov h4, v0.h[6]
; CHECK-NO16-NEXT: fmov s1, #4.00000000
; CHECK-NO16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-NO16-NEXT: mov w11, #-32768 // =0xffff8000
; CHECK-NO16-NEXT: mov h5, v0.h[7]
; CHECK-NO16-NEXT: mov h6, v0.h[1]
; CHECK-NO16-NEXT: mov h7, v0.h[2]
; CHECK-NO16-NEXT: fcvt s16, h0
; CHECK-NO16-NEXT: mov h0, v0.h[3]
; CHECK-NO16-NEXT: fcvt s2, h2
; CHECK-NO16-NEXT: fcvt s3, h3
; CHECK-NO16-NEXT: fcvt s4, h4
; CHECK-NO16-NEXT: fcvt s5, h5
; CHECK-NO16-NEXT: fcvt s6, h6
; CHECK-NO16-NEXT: fmul s2, s2, s1
; CHECK-NO16-NEXT: fmul s3, s3, s1
; CHECK-NO16-NEXT: fmul s4, s4, s1
; CHECK-NO16-NEXT: fmul s5, s5, s1
; CHECK-NO16-NEXT: fmul s6, s6, s1
; CHECK-NO16-NEXT: fcvt h2, s2
; CHECK-NO16-NEXT: fcvt h3, s3
; CHECK-NO16-NEXT: fcvt h4, s4
; CHECK-NO16-NEXT: fcvt h5, s5
; CHECK-NO16-NEXT: fcvt h6, s6
; CHECK-NO16-NEXT: mov v2.h[1], v3.h[0]
; CHECK-NO16-NEXT: fcvt s3, h7
; CHECK-NO16-NEXT: fmul s7, s16, s1
; CHECK-NO16-NEXT: mov v2.h[2], v4.h[0]
; CHECK-NO16-NEXT: fcvt s4, h0
; CHECK-NO16-NEXT: fmul s3, s3, s1
; CHECK-NO16-NEXT: fcvt h0, s7
; CHECK-NO16-NEXT: mov v2.h[3], v5.h[0]
; CHECK-NO16-NEXT: fmul s1, s4, s1
; CHECK-NO16-NEXT: fcvt h3, s3
; CHECK-NO16-NEXT: mov v0.h[1], v6.h[0]
; CHECK-NO16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: mov v0.h[2], v3.h[0]
; CHECK-NO16-NEXT: mov s4, v2.s[1]
; CHECK-NO16-NEXT: fcvtzs w10, s2
; CHECK-NO16-NEXT: mov v0.h[3], v1.h[0]
; CHECK-NO16-NEXT: mov s1, v2.s[2]
; CHECK-NO16-NEXT: mov s2, v2.s[3]
; CHECK-NO16-NEXT: fcvtzs w9, s4
; CHECK-NO16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-NO16-NEXT: fcvtzs w12, s1
; CHECK-NO16-NEXT: fcvtzs w13, s2
; CHECK-NO16-NEXT: fcvtl v3.4s, v1.4h
; CHECK-NO16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-NO16-NEXT: fmul v2.4s, v2.4s, v3.4s
; CHECK-NO16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-NO16-NEXT: fcvtn v1.4h, v2.4s
; CHECK-NO16-NEXT: fcvtn2 v1.8h, v0.4s
; CHECK-NO16-NEXT: fcvtl2 v0.4s, v1.8h
; CHECK-NO16-NEXT: fcvtl v1.4s, v1.4h
; CHECK-NO16-NEXT: mov s2, v0.s[1]
; CHECK-NO16-NEXT: fcvtzs w10, s0
; CHECK-NO16-NEXT: fcvtzs w15, s1
; CHECK-NO16-NEXT: fcvtzs w9, s2
; CHECK-NO16-NEXT: mov s2, v0.s[2]
; CHECK-NO16-NEXT: mov s0, v0.s[3]
; CHECK-NO16-NEXT: cmp w9, w8
; CHECK-NO16-NEXT: fcvtzs w12, s2
; CHECK-NO16-NEXT: mov s2, v1.s[1]
; CHECK-NO16-NEXT: csel w9, w9, w8, lt
; CHECK-NO16-NEXT: mov s1, v0.s[1]
; CHECK-NO16-NEXT: fcvtzs w15, s0
; CHECK-NO16-NEXT: fcvtzs w13, s0
; CHECK-NO16-NEXT: mov s0, v1.s[2]
; CHECK-NO16-NEXT: cmn w9, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w9, w9, w11, gt
; CHECK-NO16-NEXT: cmp w10, w8
; CHECK-NO16-NEXT: csel w10, w10, w8, lt
; CHECK-NO16-NEXT: fcvtzs w14, s2
; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: fcvtzs w14, s1
; CHECK-NO16-NEXT: mov s1, v0.s[2]
; CHECK-NO16-NEXT: fcvtzs w16, s0
; CHECK-NO16-NEXT: mov s0, v1.s[3]
; CHECK-NO16-NEXT: csel w10, w10, w11, gt
; CHECK-NO16-NEXT: cmp w12, w8
; CHECK-NO16-NEXT: mov s0, v0.s[3]
; CHECK-NO16-NEXT: csel w12, w12, w8, lt
; CHECK-NO16-NEXT: fmov s1, w10
; CHECK-NO16-NEXT: cmn w12, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w12, w12, w11, gt
; CHECK-NO16-NEXT: cmp w13, w8
; CHECK-NO16-NEXT: fcvtzs w16, s1
; CHECK-NO16-NEXT: csel w13, w13, w8, lt
; CHECK-NO16-NEXT: fmov s1, w10
; CHECK-NO16-NEXT: mov v1.s[1], w9
; CHECK-NO16-NEXT: fcvtzs w9, s0
; CHECK-NO16-NEXT: cmn w13, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w13, w13, w11, gt
; CHECK-NO16-NEXT: cmp w14, w8
; CHECK-NO16-NEXT: csel w14, w14, w8, lt
; CHECK-NO16-NEXT: mov v1.s[1], w9
; CHECK-NO16-NEXT: fcvtzs w9, s0
; CHECK-NO16-NEXT: cmn w14, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: mov v1.s[2], w12
; CHECK-NO16-NEXT: csel w14, w14, w11, gt
; CHECK-NO16-NEXT: cmp w15, w8
; CHECK-NO16-NEXT: csel w15, w15, w8, lt
; CHECK-NO16-NEXT: cmn w15, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: mov v1.s[2], w12
; CHECK-NO16-NEXT: csel w10, w15, w11, gt
; CHECK-NO16-NEXT: cmp w16, w8
; CHECK-NO16-NEXT: mov v1.s[3], w13
; CHECK-NO16-NEXT: fmov s2, w10
; CHECK-NO16-NEXT: csel w10, w16, w8, lt
; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768
; CHECK-NO16-NEXT: csel w10, w10, w11, gt
; CHECK-NO16-NEXT: cmp w9, w8
; CHECK-NO16-NEXT: mov v1.s[3], w13
; CHECK-NO16-NEXT: mov v2.s[1], w14
; CHECK-NO16-NEXT: csel w8, w9, w8, lt
; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768
Expand Down
247 changes: 32 additions & 215 deletions llvm/test/CodeGen/AArch64/fdiv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -169,60 +169,14 @@ entry:
define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fdiv_v7f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h0
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[3]
; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[4]
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5]
; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fdiv s2, s3, s2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h1
; CHECK-SD-NOFP16-NEXT: fdiv s3, s4, s3
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h18, s2
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fdiv s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v18.h[0]
; CHECK-SD-NOFP16-NEXT: fdiv s5, s6, s5
; CHECK-SD-NOFP16-NEXT: mov h6, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fdiv s6, s7, s6
; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: mov v2.h[3], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fdiv s7, s16, s7
; CHECK-SD-NOFP16-NEXT: mov h16, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fdiv s3, s17, s16
; CHECK-SD-NOFP16-NEXT: fdiv s0, s0, s1
; CHECK-SD-NOFP16-NEXT: fcvt h1, s6
; CHECK-SD-NOFP16-NEXT: mov v2.h[4], v1.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h1, s7
; CHECK-SD-NOFP16-NEXT: mov v2.h[5], v1.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h1, s3
; CHECK-SD-NOFP16-NEXT: mov v2.h[6], v1.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: mov v2.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fdiv v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fdiv_v7f16:
Expand Down Expand Up @@ -309,60 +263,14 @@ entry:
define <8 x half> @fdiv_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fdiv_v8f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h0
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[3]
; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[4]
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5]
; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fdiv s2, s3, s2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h1
; CHECK-SD-NOFP16-NEXT: fdiv s3, s4, s3
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h18, s2
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fdiv s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v18.h[0]
; CHECK-SD-NOFP16-NEXT: fdiv s5, s6, s5
; CHECK-SD-NOFP16-NEXT: mov h6, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fdiv s6, s7, s6
; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: mov v2.h[3], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fdiv s7, s16, s7
; CHECK-SD-NOFP16-NEXT: mov h16, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fdiv s3, s17, s16
; CHECK-SD-NOFP16-NEXT: fdiv s0, s0, s1
; CHECK-SD-NOFP16-NEXT: fcvt h1, s6
; CHECK-SD-NOFP16-NEXT: mov v2.h[4], v1.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h1, s7
; CHECK-SD-NOFP16-NEXT: mov v2.h[5], v1.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h1, s3
; CHECK-SD-NOFP16-NEXT: mov v2.h[6], v1.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: mov v2.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fdiv v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fdiv_v8f16:
Expand Down Expand Up @@ -394,113 +302,22 @@ entry:
define <16 x half> @fdiv_v16f16(<16 x half> %a, <16 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fdiv_v16f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s6, h0
; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[3]
; CHECK-SD-NOFP16-NEXT: mov h17, v0.h[4]
; CHECK-SD-NOFP16-NEXT: mov h18, v0.h[5]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s20, h1
; CHECK-SD-NOFP16-NEXT: mov h21, v1.h[2]
; CHECK-SD-NOFP16-NEXT: mov h22, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fcvt s21, h21
; CHECK-SD-NOFP16-NEXT: fcvt s22, h22
; CHECK-SD-NOFP16-NEXT: mov h24, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fdiv s4, s5, s4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h2
; CHECK-SD-NOFP16-NEXT: fcvt s23, h23
; CHECK-SD-NOFP16-NEXT: mov h25, v1.h[6]
; CHECK-SD-NOFP16-NEXT: fcvt s24, h24
; CHECK-SD-NOFP16-NEXT: fcvt s25, h25
; CHECK-SD-NOFP16-NEXT: fdiv s5, s6, s5
; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: fdiv s7, s7, s6
; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: fdiv s6, s16, s6
; CHECK-SD-NOFP16-NEXT: mov h16, v2.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fdiv s16, s17, s16
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
; CHECK-SD-NOFP16-NEXT: fdiv s17, s18, s17
; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[6]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fdiv s18, s19, s18
; CHECK-SD-NOFP16-NEXT: fdiv s19, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h0, v3.h[1]
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fdiv s2, s2, s0
; CHECK-SD-NOFP16-NEXT: fcvt s0, h3
; CHECK-SD-NOFP16-NEXT: fdiv s20, s20, s0
; CHECK-SD-NOFP16-NEXT: mov h0, v3.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fdiv s21, s21, s0
; CHECK-SD-NOFP16-NEXT: mov h0, v3.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fdiv s22, s22, s0
; CHECK-SD-NOFP16-NEXT: mov h0, v3.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fdiv s23, s23, s0
; CHECK-SD-NOFP16-NEXT: mov h0, v3.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fdiv s24, s24, s0
; CHECK-SD-NOFP16-NEXT: mov h0, v3.h[6]
; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s26, h0
; CHECK-SD-NOFP16-NEXT: fcvt h0, s5
; CHECK-SD-NOFP16-NEXT: fcvt h5, s2
; CHECK-SD-NOFP16-NEXT: fcvt h2, s20
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s7
; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v5.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h5, s21
; CHECK-SD-NOFP16-NEXT: fdiv s20, s25, s26
; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s6
; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v5.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h5, s22
; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s23
; CHECK-SD-NOFP16-NEXT: mov v2.h[3], v5.h[0]
; CHECK-SD-NOFP16-NEXT: fdiv s1, s1, s3
; CHECK-SD-NOFP16-NEXT: fcvt h3, s16
; CHECK-SD-NOFP16-NEXT: mov v2.h[4], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s24
; CHECK-SD-NOFP16-NEXT: mov v0.h[4], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s17
; CHECK-SD-NOFP16-NEXT: mov v2.h[5], v4.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s20
; CHECK-SD-NOFP16-NEXT: mov v0.h[5], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s18
; CHECK-SD-NOFP16-NEXT: mov v2.h[6], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.h[6], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s19
; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
; CHECK-SD-NOFP16-NEXT: mov v0.h[7], v3.h[0]
; CHECK-SD-NOFP16-NEXT: mov v2.h[7], v1.h[0]
; CHECK-SD-NOFP16-NEXT: mov v1.16b, v2.16b
; CHECK-SD-NOFP16-NEXT: fcvtl2 v4.4s, v2.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v5.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fdiv v4.4s, v5.4s, v4.4s
; CHECK-SD-NOFP16-NEXT: fcvtl v5.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fdiv v0.4s, v0.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v3.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v3.4s, v3.8h
; CHECK-SD-NOFP16-NEXT: fdiv v2.4s, v5.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v0.4h, v0.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v0.8h, v4.4s
; CHECK-SD-NOFP16-NEXT: fdiv v3.4s, v1.4s, v3.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v1.8h, v3.4s
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fdiv_v16f16:
Expand Down
879 changes: 128 additions & 751 deletions llvm/test/CodeGen/AArch64/fmla.ll

Large diffs are not rendered by default.

248 changes: 32 additions & 216 deletions llvm/test/CodeGen/AArch64/fmul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -169,60 +169,14 @@ entry:
define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fmul_v7f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h1
; CHECK-SD-NOFP16-NEXT: fcvt s5, h0
; CHECK-SD-NOFP16-NEXT: mov h6, v1.h[2]
; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h16, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fmul s3, s3, s2
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s4
; CHECK-SD-NOFP16-NEXT: fmul s4, s7, s6
; CHECK-SD-NOFP16-NEXT: mov h6, v1.h[4]
; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fmul s5, s5, s16
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s3, h6
; CHECK-SD-NOFP16-NEXT: fcvt s6, h7
; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h5, s5
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[6]
; CHECK-SD-NOFP16-NEXT: fmul s3, s6, s3
; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[6]
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: mov v2.h[3], v5.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
; CHECK-SD-NOFP16-NEXT: fmul s6, s16, s7
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: mov v2.h[4], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fmul s4, s5, s4
; CHECK-SD-NOFP16-NEXT: fcvt h3, s6
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s1
; CHECK-SD-NOFP16-NEXT: mov v2.h[5], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s4
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: mov v2.h[6], v3.h[0]
; CHECK-SD-NOFP16-NEXT: mov v2.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fmul v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fmul v1.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fmul_v7f16:
Expand Down Expand Up @@ -309,60 +263,14 @@ entry:
define <8 x half> @fmul_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fmul_v8f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h1
; CHECK-SD-NOFP16-NEXT: fcvt s5, h0
; CHECK-SD-NOFP16-NEXT: mov h6, v1.h[2]
; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h16, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fmul s3, s3, s2
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s4
; CHECK-SD-NOFP16-NEXT: fmul s4, s7, s6
; CHECK-SD-NOFP16-NEXT: mov h6, v1.h[4]
; CHECK-SD-NOFP16-NEXT: mov h7, v0.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fmul s5, s5, s16
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: mov v2.h[1], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s3, h6
; CHECK-SD-NOFP16-NEXT: fcvt s6, h7
; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h5, s5
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: mov v2.h[2], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[6]
; CHECK-SD-NOFP16-NEXT: fmul s3, s6, s3
; CHECK-SD-NOFP16-NEXT: mov h6, v0.h[6]
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: mov v2.h[3], v5.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
; CHECK-SD-NOFP16-NEXT: fmul s6, s16, s7
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: mov v2.h[4], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fmul s4, s5, s4
; CHECK-SD-NOFP16-NEXT: fcvt h3, s6
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s1
; CHECK-SD-NOFP16-NEXT: mov v2.h[5], v3.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s4
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: mov v2.h[6], v3.h[0]
; CHECK-SD-NOFP16-NEXT: mov v2.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fmul v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fmul v1.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v0.4h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fmul_v8f16:
Expand Down Expand Up @@ -394,114 +302,22 @@ entry:
define <16 x half> @fmul_v16f16(<16 x half> %a, <16 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fmul_v16f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
; CHECK-SD-NOFP16-NEXT: mov h6, v3.h[1]
; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s16, h2
; CHECK-SD-NOFP16-NEXT: fcvt s17, h0
; CHECK-SD-NOFP16-NEXT: mov h18, v2.h[2]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt s20, h3
; CHECK-SD-NOFP16-NEXT: fcvt s21, h1
; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[2]
; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
; CHECK-SD-NOFP16-NEXT: fmul s16, s17, s16
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: mov h24, v0.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s17, h18
; CHECK-SD-NOFP16-NEXT: fcvt s18, h19
; CHECK-SD-NOFP16-NEXT: mov h19, v2.h[3]
; CHECK-SD-NOFP16-NEXT: fmul s20, s21, s20
; CHECK-SD-NOFP16-NEXT: fcvt s21, h22
; CHECK-SD-NOFP16-NEXT: fcvt s22, h23
; CHECK-SD-NOFP16-NEXT: fmul s5, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[3]
; CHECK-SD-NOFP16-NEXT: mov h25, v1.h[6]
; CHECK-SD-NOFP16-NEXT: fmul s6, s7, s6
; CHECK-SD-NOFP16-NEXT: mov h7, v3.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt s24, h24
; CHECK-SD-NOFP16-NEXT: fmul s17, s18, s17
; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
; CHECK-SD-NOFP16-NEXT: fcvt h4, s16
; CHECK-SD-NOFP16-NEXT: fmul s18, s22, s21
; CHECK-SD-NOFP16-NEXT: mov h22, v0.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h16, s5
; CHECK-SD-NOFP16-NEXT: fcvt h5, s20
; CHECK-SD-NOFP16-NEXT: fcvt s21, h23
; CHECK-SD-NOFP16-NEXT: fcvt h6, s6
; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
; CHECK-SD-NOFP16-NEXT: mov h20, v2.h[4]
; CHECK-SD-NOFP16-NEXT: fmul s19, s24, s19
; CHECK-SD-NOFP16-NEXT: fcvt h17, s17
; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[4]
; CHECK-SD-NOFP16-NEXT: mov h24, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov v4.h[1], v16.h[0]
; CHECK-SD-NOFP16-NEXT: mov h16, v3.h[4]
; CHECK-SD-NOFP16-NEXT: fmul s7, s21, s7
; CHECK-SD-NOFP16-NEXT: mov v5.h[1], v6.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h6, s18
; CHECK-SD-NOFP16-NEXT: fcvt s18, h20
; CHECK-SD-NOFP16-NEXT: fcvt h19, s19
; CHECK-SD-NOFP16-NEXT: fcvt s20, h22
; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[5]
; CHECK-SD-NOFP16-NEXT: mov h22, v0.h[5]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: mov v4.h[2], v17.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
; CHECK-SD-NOFP16-NEXT: fcvt s17, h23
; CHECK-SD-NOFP16-NEXT: mov v5.h[2], v6.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h6, s7
; CHECK-SD-NOFP16-NEXT: mov h7, v3.h[5]
; CHECK-SD-NOFP16-NEXT: fmul s18, s20, s18
; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[5]
; CHECK-SD-NOFP16-NEXT: mov h20, v2.h[6]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s16, s17, s16
; CHECK-SD-NOFP16-NEXT: mov h17, v3.h[6]
; CHECK-SD-NOFP16-NEXT: mov v4.h[3], v19.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s19, h21
; CHECK-SD-NOFP16-NEXT: fcvt s21, h22
; CHECK-SD-NOFP16-NEXT: mov v5.h[3], v6.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt s6, h7
; CHECK-SD-NOFP16-NEXT: fcvt s7, h23
; CHECK-SD-NOFP16-NEXT: fcvt h18, s18
; CHECK-SD-NOFP16-NEXT: fcvt s20, h20
; CHECK-SD-NOFP16-NEXT: fcvt s22, h24
; CHECK-SD-NOFP16-NEXT: fcvt s23, h25
; CHECK-SD-NOFP16-NEXT: fcvt h16, s16
; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7]
; CHECK-SD-NOFP16-NEXT: fmul s19, s21, s19
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fmul s6, s7, s6
; CHECK-SD-NOFP16-NEXT: mov v4.h[4], v18.h[0]
; CHECK-SD-NOFP16-NEXT: fmul s7, s22, s20
; CHECK-SD-NOFP16-NEXT: mov v5.h[4], v16.h[0]
; CHECK-SD-NOFP16-NEXT: fmul s16, s23, s17
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: fcvt h17, s19
; CHECK-SD-NOFP16-NEXT: fcvt h6, s6
; CHECK-SD-NOFP16-NEXT: fcvt h2, s7
; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s3
; CHECK-SD-NOFP16-NEXT: mov v4.h[5], v17.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: mov v5.h[5], v6.h[0]
; CHECK-SD-NOFP16-NEXT: fcvt h6, s16
; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
; CHECK-SD-NOFP16-NEXT: mov v4.h[6], v2.h[0]
; CHECK-SD-NOFP16-NEXT: mov v5.h[6], v6.h[0]
; CHECK-SD-NOFP16-NEXT: mov v4.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v5.h[7], v1.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.16b, v4.16b
; CHECK-SD-NOFP16-NEXT: mov v1.16b, v5.16b
; CHECK-SD-NOFP16-NEXT: fcvtl v4.4s, v2.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v5.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v6.4s, v3.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v7.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v2.4s, v2.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v3.4s, v3.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fmul v4.4s, v5.4s, v4.4s
; CHECK-SD-NOFP16-NEXT: fmul v5.4s, v7.4s, v6.4s
; CHECK-SD-NOFP16-NEXT: fmul v2.4s, v0.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fmul v3.4s, v1.4s, v3.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v0.4h, v4.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v1.4h, v5.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v0.8h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v1.8h, v3.4s
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fmul_v16f16:
Expand Down
298 changes: 44 additions & 254 deletions llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,60 +5,14 @@
define <8 x half> @add_h(<8 x half> %a, <8 x half> %b) {
; CHECK-CVT-LABEL: add_h:
; CHECK-CVT: // %bb.0: // %entry
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
; CHECK-CVT-NEXT: fcvt s4, h1
; CHECK-CVT-NEXT: fcvt s5, h0
; CHECK-CVT-NEXT: mov h6, v1.h[2]
; CHECK-CVT-NEXT: mov h7, v0.h[2]
; CHECK-CVT-NEXT: mov h16, v1.h[3]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
; CHECK-CVT-NEXT: fadd s4, s5, s4
; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: fcvt s6, h6
; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcvt s16, h16
; CHECK-CVT-NEXT: fadd s3, s3, s2
; CHECK-CVT-NEXT: fcvt s5, h5
; CHECK-CVT-NEXT: fcvt h2, s4
; CHECK-CVT-NEXT: fadd s4, s7, s6
; CHECK-CVT-NEXT: mov h6, v1.h[4]
; CHECK-CVT-NEXT: mov h7, v0.h[4]
; CHECK-CVT-NEXT: fcvt h3, s3
; CHECK-CVT-NEXT: fadd s5, s5, s16
; CHECK-CVT-NEXT: mov h16, v0.h[5]
; CHECK-CVT-NEXT: fcvt h4, s4
; CHECK-CVT-NEXT: mov v2.h[1], v3.h[0]
; CHECK-CVT-NEXT: fcvt s3, h6
; CHECK-CVT-NEXT: fcvt s6, h7
; CHECK-CVT-NEXT: mov h7, v1.h[5]
; CHECK-CVT-NEXT: fcvt h5, s5
; CHECK-CVT-NEXT: fcvt s16, h16
; CHECK-CVT-NEXT: mov v2.h[2], v4.h[0]
; CHECK-CVT-NEXT: mov h4, v1.h[6]
; CHECK-CVT-NEXT: fadd s3, s6, s3
; CHECK-CVT-NEXT: mov h6, v0.h[6]
; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: mov h1, v1.h[7]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
; CHECK-CVT-NEXT: mov v2.h[3], v5.h[0]
; CHECK-CVT-NEXT: fcvt s4, h4
; CHECK-CVT-NEXT: fcvt h3, s3
; CHECK-CVT-NEXT: fcvt s5, h6
; CHECK-CVT-NEXT: fadd s6, s16, s7
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
; CHECK-CVT-NEXT: mov v2.h[4], v3.h[0]
; CHECK-CVT-NEXT: fadd s4, s5, s4
; CHECK-CVT-NEXT: fcvt h3, s6
; CHECK-CVT-NEXT: fadd s0, s0, s1
; CHECK-CVT-NEXT: mov v2.h[5], v3.h[0]
; CHECK-CVT-NEXT: fcvt h3, s4
; CHECK-CVT-NEXT: fcvt h0, s0
; CHECK-CVT-NEXT: mov v2.h[6], v3.h[0]
; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0]
; CHECK-CVT-NEXT: mov v0.16b, v2.16b
; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h
; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h
; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-CVT-NEXT: fadd v2.4s, v3.4s, v2.4s
; CHECK-CVT-NEXT: fadd v1.4s, v0.4s, v1.4s
; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s
; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: add_h:
Expand All @@ -74,60 +28,14 @@ entry:
define <8 x half> @sub_h(<8 x half> %a, <8 x half> %b) {
; CHECK-CVT-LABEL: sub_h:
; CHECK-CVT: // %bb.0: // %entry
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
; CHECK-CVT-NEXT: fcvt s4, h1
; CHECK-CVT-NEXT: fcvt s5, h0
; CHECK-CVT-NEXT: mov h6, v1.h[2]
; CHECK-CVT-NEXT: mov h7, v0.h[2]
; CHECK-CVT-NEXT: mov h16, v1.h[3]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
; CHECK-CVT-NEXT: fsub s4, s5, s4
; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: fcvt s6, h6
; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcvt s16, h16
; CHECK-CVT-NEXT: fsub s3, s3, s2
; CHECK-CVT-NEXT: fcvt s5, h5
; CHECK-CVT-NEXT: fcvt h2, s4
; CHECK-CVT-NEXT: fsub s4, s7, s6
; CHECK-CVT-NEXT: mov h6, v1.h[4]
; CHECK-CVT-NEXT: mov h7, v0.h[4]
; CHECK-CVT-NEXT: fcvt h3, s3
; CHECK-CVT-NEXT: fsub s5, s5, s16
; CHECK-CVT-NEXT: mov h16, v0.h[5]
; CHECK-CVT-NEXT: fcvt h4, s4
; CHECK-CVT-NEXT: mov v2.h[1], v3.h[0]
; CHECK-CVT-NEXT: fcvt s3, h6
; CHECK-CVT-NEXT: fcvt s6, h7
; CHECK-CVT-NEXT: mov h7, v1.h[5]
; CHECK-CVT-NEXT: fcvt h5, s5
; CHECK-CVT-NEXT: fcvt s16, h16
; CHECK-CVT-NEXT: mov v2.h[2], v4.h[0]
; CHECK-CVT-NEXT: mov h4, v1.h[6]
; CHECK-CVT-NEXT: fsub s3, s6, s3
; CHECK-CVT-NEXT: mov h6, v0.h[6]
; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: mov h1, v1.h[7]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
; CHECK-CVT-NEXT: mov v2.h[3], v5.h[0]
; CHECK-CVT-NEXT: fcvt s4, h4
; CHECK-CVT-NEXT: fcvt h3, s3
; CHECK-CVT-NEXT: fcvt s5, h6
; CHECK-CVT-NEXT: fsub s6, s16, s7
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
; CHECK-CVT-NEXT: mov v2.h[4], v3.h[0]
; CHECK-CVT-NEXT: fsub s4, s5, s4
; CHECK-CVT-NEXT: fcvt h3, s6
; CHECK-CVT-NEXT: fsub s0, s0, s1
; CHECK-CVT-NEXT: mov v2.h[5], v3.h[0]
; CHECK-CVT-NEXT: fcvt h3, s4
; CHECK-CVT-NEXT: fcvt h0, s0
; CHECK-CVT-NEXT: mov v2.h[6], v3.h[0]
; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0]
; CHECK-CVT-NEXT: mov v0.16b, v2.16b
; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h
; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h
; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-CVT-NEXT: fsub v2.4s, v3.4s, v2.4s
; CHECK-CVT-NEXT: fsub v1.4s, v0.4s, v1.4s
; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s
; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: sub_h:
Expand All @@ -143,60 +51,14 @@ entry:
define <8 x half> @mul_h(<8 x half> %a, <8 x half> %b) {
; CHECK-CVT-LABEL: mul_h:
; CHECK-CVT: // %bb.0: // %entry
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
; CHECK-CVT-NEXT: fcvt s4, h1
; CHECK-CVT-NEXT: fcvt s5, h0
; CHECK-CVT-NEXT: mov h6, v1.h[2]
; CHECK-CVT-NEXT: mov h7, v0.h[2]
; CHECK-CVT-NEXT: mov h16, v1.h[3]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
; CHECK-CVT-NEXT: fmul s4, s5, s4
; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: fcvt s6, h6
; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcvt s16, h16
; CHECK-CVT-NEXT: fmul s3, s3, s2
; CHECK-CVT-NEXT: fcvt s5, h5
; CHECK-CVT-NEXT: fcvt h2, s4
; CHECK-CVT-NEXT: fmul s4, s7, s6
; CHECK-CVT-NEXT: mov h6, v1.h[4]
; CHECK-CVT-NEXT: mov h7, v0.h[4]
; CHECK-CVT-NEXT: fcvt h3, s3
; CHECK-CVT-NEXT: fmul s5, s5, s16
; CHECK-CVT-NEXT: mov h16, v0.h[5]
; CHECK-CVT-NEXT: fcvt h4, s4
; CHECK-CVT-NEXT: mov v2.h[1], v3.h[0]
; CHECK-CVT-NEXT: fcvt s3, h6
; CHECK-CVT-NEXT: fcvt s6, h7
; CHECK-CVT-NEXT: mov h7, v1.h[5]
; CHECK-CVT-NEXT: fcvt h5, s5
; CHECK-CVT-NEXT: fcvt s16, h16
; CHECK-CVT-NEXT: mov v2.h[2], v4.h[0]
; CHECK-CVT-NEXT: mov h4, v1.h[6]
; CHECK-CVT-NEXT: fmul s3, s6, s3
; CHECK-CVT-NEXT: mov h6, v0.h[6]
; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: mov h1, v1.h[7]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
; CHECK-CVT-NEXT: mov v2.h[3], v5.h[0]
; CHECK-CVT-NEXT: fcvt s4, h4
; CHECK-CVT-NEXT: fcvt h3, s3
; CHECK-CVT-NEXT: fcvt s5, h6
; CHECK-CVT-NEXT: fmul s6, s16, s7
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
; CHECK-CVT-NEXT: mov v2.h[4], v3.h[0]
; CHECK-CVT-NEXT: fmul s4, s5, s4
; CHECK-CVT-NEXT: fcvt h3, s6
; CHECK-CVT-NEXT: fmul s0, s0, s1
; CHECK-CVT-NEXT: mov v2.h[5], v3.h[0]
; CHECK-CVT-NEXT: fcvt h3, s4
; CHECK-CVT-NEXT: fcvt h0, s0
; CHECK-CVT-NEXT: mov v2.h[6], v3.h[0]
; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0]
; CHECK-CVT-NEXT: mov v0.16b, v2.16b
; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h
; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h
; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-CVT-NEXT: fmul v2.4s, v3.4s, v2.4s
; CHECK-CVT-NEXT: fmul v1.4s, v0.4s, v1.4s
; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s
; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: mul_h:
Expand All @@ -212,60 +74,14 @@ entry:
define <8 x half> @div_h(<8 x half> %a, <8 x half> %b) {
; CHECK-CVT-LABEL: div_h:
; CHECK-CVT: // %bb.0: // %entry
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
; CHECK-CVT-NEXT: fcvt s4, h0
; CHECK-CVT-NEXT: mov h5, v0.h[2]
; CHECK-CVT-NEXT: mov h6, v0.h[3]
; CHECK-CVT-NEXT: mov h7, v0.h[4]
; CHECK-CVT-NEXT: mov h16, v0.h[5]
; CHECK-CVT-NEXT: mov h17, v0.h[6]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
; CHECK-CVT-NEXT: fcvt s5, h5
; CHECK-CVT-NEXT: fcvt s6, h6
; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcvt s16, h16
; CHECK-CVT-NEXT: fcvt s17, h17
; CHECK-CVT-NEXT: fcvt s0, h0
; CHECK-CVT-NEXT: fdiv s2, s3, s2
; CHECK-CVT-NEXT: fcvt s3, h1
; CHECK-CVT-NEXT: fdiv s3, s4, s3
; CHECK-CVT-NEXT: mov h4, v1.h[2]
; CHECK-CVT-NEXT: fcvt h18, s2
; CHECK-CVT-NEXT: fcvt s4, h4
; CHECK-CVT-NEXT: fdiv s4, s5, s4
; CHECK-CVT-NEXT: mov h5, v1.h[3]
; CHECK-CVT-NEXT: fcvt h2, s3
; CHECK-CVT-NEXT: fcvt s5, h5
; CHECK-CVT-NEXT: mov v2.h[1], v18.h[0]
; CHECK-CVT-NEXT: fdiv s5, s6, s5
; CHECK-CVT-NEXT: mov h6, v1.h[4]
; CHECK-CVT-NEXT: fcvt h4, s4
; CHECK-CVT-NEXT: fcvt s6, h6
; CHECK-CVT-NEXT: mov v2.h[2], v4.h[0]
; CHECK-CVT-NEXT: fdiv s6, s7, s6
; CHECK-CVT-NEXT: mov h7, v1.h[5]
; CHECK-CVT-NEXT: fcvt h4, s5
; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: mov v2.h[3], v4.h[0]
; CHECK-CVT-NEXT: fdiv s7, s16, s7
; CHECK-CVT-NEXT: mov h16, v1.h[6]
; CHECK-CVT-NEXT: mov h1, v1.h[7]
; CHECK-CVT-NEXT: fcvt s16, h16
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fdiv s3, s17, s16
; CHECK-CVT-NEXT: fdiv s0, s0, s1
; CHECK-CVT-NEXT: fcvt h1, s6
; CHECK-CVT-NEXT: mov v2.h[4], v1.h[0]
; CHECK-CVT-NEXT: fcvt h1, s7
; CHECK-CVT-NEXT: mov v2.h[5], v1.h[0]
; CHECK-CVT-NEXT: fcvt h1, s3
; CHECK-CVT-NEXT: mov v2.h[6], v1.h[0]
; CHECK-CVT-NEXT: fcvt h0, s0
; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0]
; CHECK-CVT-NEXT: mov v0.16b, v2.16b
; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h
; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h
; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-CVT-NEXT: fdiv v2.4s, v3.4s, v2.4s
; CHECK-CVT-NEXT: fdiv v1.4s, v0.4s, v1.4s
; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s
; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: div_h:
Expand Down Expand Up @@ -312,25 +128,12 @@ define <8 x half> @s_to_h(<8 x float> %a) {
define <8 x half> @d_to_h(<8 x double> %a) {
; CHECK-LABEL: d_to_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov d5, v0.d[1]
; CHECK-NEXT: fcvt h0, d0
; CHECK-NEXT: fcvt h4, d1
; CHECK-NEXT: mov d1, v1.d[1]
; CHECK-NEXT: fcvt h5, d5
; CHECK-NEXT: fcvt h1, d1
; CHECK-NEXT: mov v0.h[1], v5.h[0]
; CHECK-NEXT: mov v0.h[2], v4.h[0]
; CHECK-NEXT: mov v0.h[3], v1.h[0]
; CHECK-NEXT: fcvt h1, d2
; CHECK-NEXT: mov d2, v2.d[1]
; CHECK-NEXT: mov v0.h[4], v1.h[0]
; CHECK-NEXT: fcvt h1, d2
; CHECK-NEXT: mov d2, v3.d[1]
; CHECK-NEXT: mov v0.h[5], v1.h[0]
; CHECK-NEXT: fcvt h1, d3
; CHECK-NEXT: mov v0.h[6], v1.h[0]
; CHECK-NEXT: fcvt h1, d2
; CHECK-NEXT: mov v0.h[7], v1.h[0]
; CHECK-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-NEXT: fcvtxn v2.2s, v2.2d
; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
; CHECK-NEXT: fcvtxn2 v2.4s, v3.2d
; CHECK-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
; CHECK-NEXT: ret
%1 = fptrunc <8 x double> %a to <8 x half>
ret <8 x half> %1
Expand All @@ -349,25 +152,12 @@ define <8 x float> @h_to_s(<8 x half> %a) {
define <8 x double> @h_to_d(<8 x half> %a) {
; CHECK-LABEL: h_to_d:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: mov h1, v0.h[1]
; CHECK-NEXT: mov h3, v0.h[3]
; CHECK-NEXT: mov h4, v0.h[2]
; CHECK-NEXT: fcvt d0, h0
; CHECK-NEXT: mov h5, v2.h[1]
; CHECK-NEXT: mov h6, v2.h[3]
; CHECK-NEXT: mov h7, v2.h[2]
; CHECK-NEXT: fcvt d16, h1
; CHECK-NEXT: fcvt d17, h3
; CHECK-NEXT: fcvt d1, h4
; CHECK-NEXT: fcvt d2, h2
; CHECK-NEXT: fcvt d4, h5
; CHECK-NEXT: fcvt d5, h6
; CHECK-NEXT: fcvt d3, h7
; CHECK-NEXT: mov v0.d[1], v16.d[0]
; CHECK-NEXT: mov v1.d[1], v17.d[0]
; CHECK-NEXT: mov v2.d[1], v4.d[0]
; CHECK-NEXT: mov v3.d[1], v5.d[0]
; CHECK-NEXT: fcvtl v1.4s, v0.4h
; CHECK-NEXT: fcvtl2 v2.4s, v0.8h
; CHECK-NEXT: fcvtl v0.2d, v1.2s
; CHECK-NEXT: fcvtl2 v3.2d, v2.4s
; CHECK-NEXT: fcvtl2 v1.2d, v1.4s
; CHECK-NEXT: fcvtl v2.2d, v2.2s
; CHECK-NEXT: ret
%1 = fpext <8 x half> %a to <8 x double>
ret <8 x double> %1
Expand Down
64 changes: 37 additions & 27 deletions llvm/test/CodeGen/AArch64/fpext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -85,29 +85,46 @@ entry:
}

define <2 x double> @fpext_v2f16_v2f64(<2 x half> %a) {
; CHECK-LABEL: fpext_v2f16_v2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h1, v0.h[1]
; CHECK-NEXT: fcvt d0, h0
; CHECK-NEXT: fcvt d1, h1
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
; CHECK-SD-LABEL: fpext_v2f16_v2f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fpext_v2f16_v2f64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: fcvt d0, h0
; CHECK-GI-NEXT: fcvt d1, h1
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ret
entry:
%c = fpext <2 x half> %a to <2 x double>
ret <2 x double> %c
}

define <3 x double> @fpext_v3f16_v3f64(<3 x half> %a) {
; CHECK-LABEL: fpext_v3f16_v3f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h1, v0.h[1]
; CHECK-NEXT: mov h2, v0.h[2]
; CHECK-NEXT: fcvt d0, h0
; CHECK-NEXT: fcvt d1, h1
; CHECK-NEXT: fcvt d2, h2
; CHECK-NEXT: ret
; CHECK-SD-LABEL: fpext_v3f16_v3f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fcvtl v1.4s, v0.4h
; CHECK-SD-NEXT: fcvtl v0.2d, v1.2s
; CHECK-SD-NEXT: fcvtl2 v2.2d, v1.4s
; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fpext_v3f16_v3f64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov h2, v0.h[2]
; CHECK-GI-NEXT: fcvt d0, h0
; CHECK-GI-NEXT: fcvt d1, h1
; CHECK-GI-NEXT: fcvt d2, h2
; CHECK-GI-NEXT: ret
entry:
%c = fpext <3 x half> %a to <3 x double>
ret <3 x double> %c
Expand All @@ -116,16 +133,9 @@ entry:
define <4 x double> @fpext_v4f16_v4f64(<4 x half> %a) {
; CHECK-SD-LABEL: fpext_v4f16_v4f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: mov h1, v0.h[1]
; CHECK-SD-NEXT: mov h2, v0.h[3]
; CHECK-SD-NEXT: mov h3, v0.h[2]
; CHECK-SD-NEXT: fcvt d0, h0
; CHECK-SD-NEXT: fcvt d4, h1
; CHECK-SD-NEXT: fcvt d2, h2
; CHECK-SD-NEXT: fcvt d1, h3
; CHECK-SD-NEXT: mov v0.d[1], v4.d[0]
; CHECK-SD-NEXT: mov v1.d[1], v2.d[0]
; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h
; CHECK-SD-NEXT: fcvtl2 v1.2d, v0.4s
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fpext_v4f16_v4f64:
Expand Down
20 changes: 5 additions & 15 deletions llvm/test/CodeGen/AArch64/fptrunc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,8 @@ entry:
define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) {
; CHECK-SD-LABEL: fptrunc_v2f64_v2f16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov d1, v0.d[1]
; CHECK-SD-NEXT: fcvt h0, d0
; CHECK-SD-NEXT: fcvt h1, d1
; CHECK-SD-NEXT: mov v0.h[1], v1.h[0]
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptrunc_v2f64_v2f16:
Expand Down Expand Up @@ -135,16 +132,9 @@ entry:
define <4 x half> @fptrunc_v4f64_v4f16(<4 x double> %a) {
; CHECK-SD-LABEL: fptrunc_v4f64_v4f16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov d2, v0.d[1]
; CHECK-SD-NEXT: fcvt h0, d0
; CHECK-SD-NEXT: fcvt h2, d2
; CHECK-SD-NEXT: mov v0.h[1], v2.h[0]
; CHECK-SD-NEXT: fcvt h2, d1
; CHECK-SD-NEXT: mov d1, v1.d[1]
; CHECK-SD-NEXT: mov v0.h[2], v2.h[0]
; CHECK-SD-NEXT: fcvt h1, d1
; CHECK-SD-NEXT: mov v0.h[3], v1.h[0]
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-SD-NEXT: fcvtxn2 v0.4s, v1.2d
; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptrunc_v4f64_v4f16:
Expand Down
5 changes: 0 additions & 5 deletions llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ define <vscale x 8 x half> @fdiv_recip_8f16(<vscale x 8 x half> %a, <vscale x 8
; CHECK-LABEL: fdiv_recip_8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: frecpe z2.h, z1.h
; CHECK-NEXT: frecps z3.h, z1.h, z2.h
; CHECK-NEXT: fmul z2.h, z2.h, z3.h
; CHECK-NEXT: frecps z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z1.h, z2.h, z1.h
; CHECK-NEXT: fmul z0.h, z1.h, z0.h
Expand Down Expand Up @@ -98,9 +96,6 @@ define <vscale x 8 x half> @fsqrt_recip_8f16(<vscale x 8 x half> %a) #0 {
; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z2.h, z1.h, z1.h
; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
%fsqrt = call fast <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> %a)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -427,49 +427,35 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
; SVE-LABEL: test_copysign_v4f16_v4f64:
; SVE: // %bb.0:
; SVE-NEXT: sub sp, sp, #16
; SVE-NEXT: .cfi_def_cfa_offset 16
; SVE-NEXT: ldp q1, q0, [x1]
; SVE-NEXT: ldr d4, [x0]
; SVE-NEXT: and z4.h, z4.h, #0x7fff
; SVE-NEXT: mov z2.d, z0.d[1]
; SVE-NEXT: mov z3.d, z1.d[1]
; SVE-NEXT: fcvt h0, d0
; SVE-NEXT: fcvt h1, d1
; SVE-NEXT: fcvt h2, d2
; SVE-NEXT: fcvt h3, d3
; SVE-NEXT: str h0, [sp, #12]
; SVE-NEXT: str h1, [sp, #8]
; SVE-NEXT: str h2, [sp, #14]
; SVE-NEXT: str h3, [sp, #10]
; SVE-NEXT: ldr d0, [sp, #8]
; SVE-NEXT: ldp q0, q1, [x1]
; SVE-NEXT: ptrue p0.s, vl2
; SVE-NEXT: ptrue p1.s
; SVE-NEXT: fcvtxn v1.2s, v1.2d
; SVE-NEXT: fcvtxn v0.2s, v0.2d
; SVE-NEXT: splice z0.s, p0, z0.s, z1.s
; SVE-NEXT: ldr d1, [x0]
; SVE-NEXT: and z1.h, z1.h, #0x7fff
; SVE-NEXT: fcvt z0.h, p1/m, z0.s
; SVE-NEXT: uzp1 z0.h, z0.h, z0.h
; SVE-NEXT: and z0.h, z0.h, #0x8000
; SVE-NEXT: orr z0.d, z4.d, z0.d
; SVE-NEXT: orr z0.d, z1.d, z0.d
; SVE-NEXT: str d0, [x0]
; SVE-NEXT: add sp, sp, #16
; SVE-NEXT: ret
;
; SVE2-LABEL: test_copysign_v4f16_v4f64:
; SVE2: // %bb.0:
; SVE2-NEXT: sub sp, sp, #16
; SVE2-NEXT: .cfi_def_cfa_offset 16
; SVE2-NEXT: ldp q2, q1, [x1]
; SVE2-NEXT: mov z0.h, #32767 // =0x7fff
; SVE2-NEXT: ldr d5, [x0]
; SVE2-NEXT: mov z3.d, z1.d[1]
; SVE2-NEXT: mov z4.d, z2.d[1]
; SVE2-NEXT: fcvt h1, d1
; SVE2-NEXT: fcvt h2, d2
; SVE2-NEXT: fcvt h3, d3
; SVE2-NEXT: fcvt h4, d4
; SVE2-NEXT: str h1, [sp, #12]
; SVE2-NEXT: str h2, [sp, #8]
; SVE2-NEXT: str h3, [sp, #14]
; SVE2-NEXT: str h4, [sp, #10]
; SVE2-NEXT: ldr d1, [sp, #8]
; SVE2-NEXT: bsl z5.d, z5.d, z1.d, z0.d
; SVE2-NEXT: str d5, [x0]
; SVE2-NEXT: add sp, sp, #16
; SVE2-NEXT: ldp q0, q1, [x1]
; SVE2-NEXT: ptrue p0.s, vl2
; SVE2-NEXT: ptrue p1.s
; SVE2-NEXT: ldr d2, [x0]
; SVE2-NEXT: fcvtxn v1.2s, v1.2d
; SVE2-NEXT: fcvtxn v0.2s, v0.2d
; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s
; SVE2-NEXT: mov z1.h, #32767 // =0x7fff
; SVE2-NEXT: fcvt z0.h, p1/m, z0.s
; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h
; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d
; SVE2-NEXT: str d2, [x0]
; SVE2-NEXT: ret
%a = load <4 x half>, ptr %ap
%b = load <4 x double>, ptr %bp
Expand Down
196 changes: 67 additions & 129 deletions llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -147,79 +147,48 @@ define double @add_D(<2 x double> %bin.rdx) {
define half @add_2H(<16 x half> %bin.rdx) {
; CHECK-SD-NOFP16-LABEL: add_2H:
; CHECK-SD-NOFP16: // %bb.0:
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h1
; CHECK-SD-NOFP16-NEXT: fcvt s5, h0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fadd s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[2]
; CHECK-SD-NOFP16-NEXT: fadd s2, s3, s2
; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fadd s3, s5, s3
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[3]
; CHECK-SD-NOFP16-NEXT: fadd s2, s4, s2
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fadd s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[4]
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fadd v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT: mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s2, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s2, s0
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fadd s3, s5, s3
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[5]
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s4
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fadd s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT: fcvt h3, s4
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[6]
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s1
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT: fadd s3, s5, s4
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s1
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: ret
;
Expand Down Expand Up @@ -574,78 +543,47 @@ exit:
define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fadd_reduct_reassoc_v8f16:
; CHECK-SD-NOFP16: // %bb.0:
; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h0
; CHECK-SD-NOFP16-NEXT: fcvt s5, h1
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fadd s2, s4, s2
; CHECK-SD-NOFP16-NEXT: fadd s3, s5, s3
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fadd s3, s3, s5
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[3]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fadd s3, s3, s5
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[4]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fadd v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT: mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s2, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s2, s0
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fadd s3, s3, s5
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[5]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fadd s3, s3, s5
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fadd s3, s3, s5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fadd s0, s2, s0
; CHECK-SD-NOFP16-NEXT: fadd s1, s3, s1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s1
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: ret
Expand Down
196 changes: 67 additions & 129 deletions llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
Original file line number Diff line number Diff line change
Expand Up @@ -182,79 +182,48 @@ define double @mul_D(<2 x double> %bin.rdx) {
define half @mul_2H(<16 x half> %bin.rdx) {
; CHECK-SD-NOFP16-LABEL: mul_2H:
; CHECK-SD-NOFP16: // %bb.0:
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h1
; CHECK-SD-NOFP16-NEXT: fcvt s5, h0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[2]
; CHECK-SD-NOFP16-NEXT: fmul s2, s3, s2
; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fmul s3, s5, s3
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[3]
; CHECK-SD-NOFP16-NEXT: fmul s2, s4, s2
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fmul s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[4]
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3
; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fmul v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT: mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s2, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s2, s0
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fmul s3, s5, s3
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[5]
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s4
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fmul s4, s5, s4
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3
; CHECK-SD-NOFP16-NEXT: fcvt h3, s4
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[6]
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s1
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3
; CHECK-SD-NOFP16-NEXT: fmul s3, s5, s4
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3
; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fmul s0, s1, s0
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s1
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: ret
;
Expand Down Expand Up @@ -361,78 +330,47 @@ define float @mul_S_init_42(<4 x float> %bin.rdx) {
define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fmul_reduct_reassoc_v8f16:
; CHECK-SD-NOFP16: // %bb.0:
; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s4, h0
; CHECK-SD-NOFP16-NEXT: fcvt s5, h1
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s2, s4, s2
; CHECK-SD-NOFP16-NEXT: fmul s3, s5, s3
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fmul s3, s3, s5
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[3]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fmul s3, s3, s5
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[4]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT: fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT: fmul v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT: fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT: fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT: mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT: fcvt s2, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s2, s0
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fmul s3, s3, s5
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[5]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fmul s3, s3, s5
; CHECK-SD-NOFP16-NEXT: mov h4, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h5, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s4
; CHECK-SD-NOFP16-NEXT: fmul s3, s3, s5
; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
; CHECK-SD-NOFP16-NEXT: fmul s0, s2, s0
; CHECK-SD-NOFP16-NEXT: fmul s1, s3, s1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s1
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: ret
Expand Down
62 changes: 17 additions & 45 deletions llvm/test/CodeGen/AArch64/vector-fcopysign.ll
Original file line number Diff line number Diff line change
Expand Up @@ -209,16 +209,10 @@ define <4 x half> @test_copysign_v4f16_v4f32(<4 x half> %a, <4 x float> %b) #0 {
define <4 x half> @test_copysign_v4f16_v4f64(<4 x half> %a, <4 x double> %b) #0 {
; CHECK-LABEL: test_copysign_v4f16_v4f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: mov d3, v1[1]
; CHECK-NEXT: fcvt h1, d1
; CHECK-NEXT: fcvt h3, d3
; CHECK-NEXT: mov.h v1[1], v3[0]
; CHECK-NEXT: fcvt h3, d2
; CHECK-NEXT: mov d2, v2[1]
; CHECK-NEXT: mov.h v1[2], v3[0]
; CHECK-NEXT: fcvt h2, d2
; CHECK-NEXT: mov.h v1[3], v2[0]
; CHECK-NEXT: fcvtxn v1.2s, v1.2d
; CHECK-NEXT: fcvtxn2 v1.4s, v2.2d
; CHECK-NEXT: mvni.4h v2, #128, lsl #8
; CHECK-NEXT: fcvtn v1.4h, v1.4s
; CHECK-NEXT: bif.8b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x double> %b to <4 x half>
Expand Down Expand Up @@ -291,42 +285,20 @@ define <4 x bfloat> @test_copysign_v4bf16_v4f32(<4 x bfloat> %a, <4 x float> %b)
define <4 x bfloat> @test_copysign_v4bf16_v4f64(<4 x bfloat> %a, <4 x double> %b) #0 {
; CHECK-LABEL: test_copysign_v4bf16_v4f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: mov d3, v1[1]
; CHECK-NEXT: fcvtxn s1, d1
; CHECK-NEXT: mov w8, #32767 ; =0x7fff
; CHECK-NEXT: fcvtxn s3, d3
; CHECK-NEXT: fmov w10, s1
; CHECK-NEXT: ubfx w12, w10, #16, #1
; CHECK-NEXT: add w10, w10, w8
; CHECK-NEXT: fmov w9, s3
; CHECK-NEXT: fcvtxn s3, d2
; CHECK-NEXT: mov d2, v2[1]
; CHECK-NEXT: add w10, w12, w10
; CHECK-NEXT: lsr w10, w10, #16
; CHECK-NEXT: ubfx w11, w9, #16, #1
; CHECK-NEXT: add w9, w9, w8
; CHECK-NEXT: fcvtxn s1, d2
; CHECK-NEXT: add w9, w11, w9
; CHECK-NEXT: fmov w11, s3
; CHECK-NEXT: fmov s3, w10
; CHECK-NEXT: lsr w9, w9, #16
; CHECK-NEXT: ubfx w12, w11, #16, #1
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: add w9, w11, w8
; CHECK-NEXT: fmov w10, s1
; CHECK-NEXT: add w9, w12, w9
; CHECK-NEXT: lsr w9, w9, #16
; CHECK-NEXT: mov.h v3[1], v2[0]
; CHECK-NEXT: ubfx w11, w10, #16, #1
; CHECK-NEXT: add w8, w10, w8
; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: add w8, w11, w8
; CHECK-NEXT: lsr w8, w8, #16
; CHECK-NEXT: mov.h v3[2], v1[0]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: mov.h v3[3], v1[0]
; CHECK-NEXT: mvni.4h v1, #128, lsl #8
; CHECK-NEXT: bif.8b v0, v3, v1
; CHECK-NEXT: fcvtxn v1.2s, v1.2d
; CHECK-NEXT: movi.4s v3, #1
; CHECK-NEXT: fcvtxn2 v1.4s, v2.2d
; CHECK-NEXT: movi.4s v2, #127, msl #8
; CHECK-NEXT: ushr.4s v4, v1, #16
; CHECK-NEXT: add.4s v2, v1, v2
; CHECK-NEXT: and.16b v3, v4, v3
; CHECK-NEXT: add.4s v2, v3, v2
; CHECK-NEXT: fcmeq.4s v3, v1, v1
; CHECK-NEXT: orr.4s v1, #64, lsl #16
; CHECK-NEXT: bit.16b v1, v2, v3
; CHECK-NEXT: mvni.4h v2, #128, lsl #8
; CHECK-NEXT: shrn.4h v1, v1, #16
; CHECK-NEXT: bif.8b v0, v1, v2
; CHECK-NEXT: ret
%tmp0 = fptrunc <4 x double> %b to <4 x bfloat>
%r = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %a, <4 x bfloat> %tmp0)
Expand Down