diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..da6c65f2c1c7d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18570,7 +18570,7 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( case MVT::f64: return true; case MVT::bf16: - return VT.isScalableVector() && Subtarget->hasSVEB16B16() && + return VT.isScalableVector() && Subtarget->hasBF16() && Subtarget->isNonStreamingSVEorSME2Available(); default: break; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3b268dcbca600..ece012a035bfc 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2578,6 +2578,11 @@ let Predicates = [HasBF16, HasSVE_or_SME] in { defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>; defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>; + def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, + (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))), + (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))), + (BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>; + defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>; defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>; } // End HasBF16, HasSVE_or_SME diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll index 0580f5e0b019a..582e8456c05b3 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll @@ -466,12 +466,10 @@ define @fmla_nxv2bf16( %a, @fmla_nxv4bf16( %a, %b, %c) { ; NOB16B16-LABEL: fmla_nxv4bf16: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16 ; NOB16B16-NEXT: ptrue p0.s -; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s ; NOB16B16-NEXT: ret ; ; B16B16-LABEL: fmla_nxv4bf16: @@ -486,24 +484,20 @@ define @fmla_nxv4bf16( %a, @fmla_nxv8bf16( %a, %b, %c) { ; NOB16B16-LABEL: fmla_nxv8bf16: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: uunpkhi z3.s, z1.h -; NOB16B16-NEXT: uunpkhi z4.s, z0.h -; NOB16B16-NEXT: uunpkhi z5.s, z2.h +; NOB16B16-NEXT: uunpkhi z3.s, z2.h +; NOB16B16-NEXT: uunpklo z2.s, z2.h +; NOB16B16-NEXT: uunpkhi z4.s, z1.h +; NOB16B16-NEXT: uunpkhi z5.s, z0.h ; NOB16B16-NEXT: uunpklo z1.s, z1.h ; NOB16B16-NEXT: uunpklo z0.s, z0.h -; NOB16B16-NEXT: uunpklo z2.s, z2.h ; NOB16B16-NEXT: ptrue p0.s ; NOB16B16-NEXT: lsl z3.s, z3.s, #16 -; NOB16B16-NEXT: lsl z4.s, z4.s, #16 -; NOB16B16-NEXT: lsl z5.s, z5.s, #16 -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16 -; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s -; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s -; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: bfmlalb z3.s, z5.h, z4.h +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z3.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: uzp1 z0.h, z1.h, z0.h ; NOB16B16-NEXT: ret ; ; B16B16-LABEL: fmla_nxv8bf16: diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index 5c58eab391972..16e8feb0dc5bb 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -1,79 +1,175 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=SVE +; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=SVE-B16B16 target triple = "aarch64-unknown-linux-gnu" define @fmla_nxv8bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmla_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z4.s, z2.h +; SVE-NEXT: uunpkhi z5.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: bfmlalb z3.s, z5.h, z4.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p0/m, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.h +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fadd contract %acc, %mul ret %res } define @fmla_nxv4bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmla_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.s +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fadd contract %acc, %mul ret %res } define @fmla_nxv2bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmla_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.d +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fadd contract %acc, %mul ret %res } define @fmls_nxv8bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmls_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z5.s, z2.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: fneg z1.h, p0/m, z1.h +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfmlalb z3.s, z4.h, z5.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p0/m, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.h +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fsub contract %acc, %mul ret %res } define @fmls_nxv4bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmls_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p0/m, z1.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.s +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fsub contract %acc, %mul ret %res } define @fmls_nxv2bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmls_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p0/m, z1.h +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.d +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fsub contract %acc, %mul ret %res } define @fmla_sel_nxv8bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmla_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: uunpkhi z5.s, z2.h +; SVE-NEXT: uunpkhi z6.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfmlalb z3.s, z6.h, z5.h +; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s +; SVE-NEXT: bfcvt z2.h, p1/m, z4.s +; SVE-NEXT: uzp1 z1.h, z2.h, z1.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %add = fadd contract %acc, %mul %res = select %pred, %add, %acc @@ -81,10 +177,17 @@ define @fmla_sel_nxv8bf16( %pred, @fmla_sel_nxv4bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmla_sel_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z3.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %add = fadd contract %acc, %mul %res = select %pred, %add, %acc @@ -92,10 +195,20 @@ define @fmla_sel_nxv4bf16( %pred, @fmla_sel_nxv2bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmla_sel_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: ptrue p1.d +; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %add = fadd contract %acc, %mul %res = select %pred, %add, %acc @@ -103,10 +216,31 @@ define @fmla_sel_nxv2bf16( %pred, @fmls_sel_nxv8bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmls_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: uunpkhi z6.s, z2.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: fneg z1.h, p1/m, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: uunpkhi z5.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfmlalb z3.s, z5.h, z6.h +; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s +; SVE-NEXT: bfcvt z2.h, p1/m, z4.s +; SVE-NEXT: uzp1 z1.h, z2.h, z1.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %sub = fsub contract %acc, %mul %res = select %pred, %sub, %acc @@ -114,10 +248,19 @@ define @fmls_sel_nxv8bf16( %pred, @fmls_sel_nxv4bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmls_sel_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p1/m, z1.h +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z3.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %sub = fsub contract %acc, %mul %res = select %pred, %sub, %acc @@ -125,10 +268,21 @@ define @fmls_sel_nxv4bf16( %pred, @fmls_sel_nxv2bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmls_sel_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: ptrue p1.d +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p1/m, z1.h +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %sub = fsub contract %acc, %mul %res = select %pred, %sub, %acc @@ -136,33 +290,90 @@ define @fmls_sel_nxv2bf16( %pred, @fadd_sel_nxv8bf16( %a, %b, %mask) { -; CHECK-LABEL: fadd_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %sel = select %mask, %b, zeroinitializer %fadd = fadd nsz %a, %sel ret %fadd } define @fsub_sel_nxv8bf16( %a, %b, %mask) { -; CHECK-LABEL: fsub_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %sel = select %mask, %b, zeroinitializer %fsub = fsub %a, %sel ret %fsub } define @fadd_sel_negzero_nxv8bf16( %a, %b, %mask) { -; CHECK-LABEL: fadd_sel_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %nz = fneg zeroinitializer %sel = select %mask, %b, %nz %fadd = fadd %a, %sel @@ -170,11 +381,30 @@ define @fadd_sel_negzero_nxv8bf16( %a } define @fsub_sel_negzero_nxv8bf16( %a, %b, %mask) { -; CHECK-LABEL: fsub_sel_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %nz = fneg zeroinitializer %sel = select %mask, %b, %nz %fsub = fsub nsz %a, %sel @@ -182,13 +412,46 @@ define @fsub_sel_negzero_nxv8bf16( %a } define @fadd_sel_fmul_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fadd_sel_fmul_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: bfadd z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: movi v3.2d, #0000000000000000 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p1/m, z2.s +; SVE-NEXT: bfcvt z0.h, p1/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: movi v3.2d, #0000000000000000 +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-B16B16-NEXT: bfadd z0.h, z0.h, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %sel = select %mask, %fmul, zeroinitializer %fadd = fadd contract %a, %sel @@ -196,12 +459,41 @@ define @fadd_sel_fmul_nxv8bf16( %a, < } define @fsub_sel_fmul_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fsub_sel_fmul_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %sel = select %mask, %fmul, zeroinitializer %fsub = fsub contract %a, %sel @@ -209,12 +501,41 @@ define @fsub_sel_fmul_nxv8bf16( %a, < } define @fadd_sel_fmul_nsz_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fadd_sel_fmul_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %sel = select %mask, %fmul, zeroinitializer %fadd = fadd nsz contract %a, %sel @@ -222,12 +543,41 @@ define @fadd_sel_fmul_nsz_nxv8bf16( % } define @fsub_sel_fmul_nsz_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fsub_sel_fmul_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %sel = select %mask, %fmul, zeroinitializer %fsub = fsub nsz contract %a, %sel @@ -235,12 +585,41 @@ define @fsub_sel_fmul_nsz_nxv8bf16( % } define @fadd_sel_fmul_negzero_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fadd_sel_fmul_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %nz = fneg zeroinitializer %sel = select %mask, %fmul, %nz @@ -249,15 +628,50 @@ define @fadd_sel_fmul_negzero_nxv8bf16( @fsub_sel_fmul_negzero_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fsub_sel_fmul_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: fmov h3, w8 -; CHECK-NEXT: mov z3.h, h3 -; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: bfsub z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: mov w8, #32768 // =0x8000 +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: fmov h3, w8 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: mov z3.h, h3 +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p1/m, z2.s +; SVE-NEXT: bfcvt z0.h, p1/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: mov w8, #32768 // =0x8000 +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: fmov h3, w8 +; SVE-B16B16-NEXT: mov z3.h, h3 +; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-B16B16-NEXT: bfsub z0.h, z0.h, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %nz = fneg zeroinitializer %sel = select %mask, %fmul, %nz @@ -266,12 +680,41 @@ define @fsub_sel_fmul_negzero_nxv8bf16( @fadd_sel_fmul_negzero_nsz_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %nz = fneg zeroinitializer %sel = select %mask, %fmul, %nz @@ -280,12 +723,41 @@ define @fadd_sel_fmul_negzero_nsz_nxv8bf16( @fsub_sel_fmul_negzero_nsz_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %nz = fneg zeroinitializer %sel = select %mask, %fmul, %nz