From d0e82f298e0ab203304f2d61308ae71009cb1b0f Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Mon, 10 Nov 2025 15:42:56 +0000 Subject: [PATCH 1/4] Update sve-bf16-combines.ll to show output without sve-b16b16 support. --- .../test/CodeGen/AArch64/sve-bf16-combines.ll | 791 +++++++++++++++--- 1 file changed, 664 insertions(+), 127 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index 5c58eab391972..1e3657ad703d9 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -1,79 +1,217 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=SVE +; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=SVE-B16B16 target triple = "aarch64-unknown-linux-gnu" define @fmla_nxv8bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmla_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p0/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: bfcvt z1.h, p0/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p0/m, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.h +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fadd contract %acc, %mul ret %res } define @fmla_nxv4bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmla_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z1.h, p0/m, z1.s +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.s +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fadd contract %acc, %mul ret %res } define @fmla_nxv2bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmla_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fmul z1.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfcvt z1.h, p0/m, z1.s +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.d +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fadd contract %acc, %mul ret %res } define @fmls_nxv8bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmls_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p0/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: bfcvt z1.h, p0/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p0/m, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.h +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fsub contract %acc, %mul ret %res } define @fmls_nxv4bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmls_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z1.h, p0/m, z1.s +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.s +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fsub contract %acc, %mul ret %res } define @fmls_nxv2bf16( %acc, %m1, %m2) { -; CHECK-LABEL: fmls_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: fmul z1.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfcvt z1.h, p0/m, z1.s +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z0.s, p0/m, z0.s, z1.s +; SVE-NEXT: bfcvt z0.h, p0/m, z0.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: ptrue p0.d +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %res = fsub contract %acc, %mul ret %res } define @fmla_sel_nxv8bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmla_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %add = fadd contract %acc, %mul %res = select %pred, %add, %acc @@ -81,10 +219,23 @@ define @fmla_sel_nxv8bf16( %pred, @fmla_sel_nxv4bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmla_sel_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: lsl z2.s, z0.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z1.s, z2.s, z1.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %add = fadd contract %acc, %mul %res = select %pred, %add, %acc @@ -92,10 +243,23 @@ define @fmla_sel_nxv4bf16( %pred, @fmla_sel_nxv2bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmla_sel_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmla_sel_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: ptrue p1.d +; SVE-NEXT: fmul z1.s, p1/m, z1.s, z2.s +; SVE-NEXT: lsl z2.s, z0.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z1.s, p1/m, z1.s, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmla_sel_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %add = fadd contract %acc, %mul %res = select %pred, %add, %acc @@ -103,10 +267,39 @@ define @fmla_sel_nxv2bf16( %pred, @fmls_sel_nxv8bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmls_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %sub = fsub contract %acc, %mul %res = select %pred, %sub, %acc @@ -114,10 +307,23 @@ define @fmls_sel_nxv8bf16( %pred, @fmls_sel_nxv4bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmls_sel_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: lsl z2.s, z0.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z1.s, z2.s, z1.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv4bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %sub = fsub contract %acc, %mul %res = select %pred, %sub, %acc @@ -125,10 +331,23 @@ define @fmls_sel_nxv4bf16( %pred, @fmls_sel_nxv2bf16( %pred, %acc, %m1, %m2) { -; CHECK-LABEL: fmls_sel_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h -; CHECK-NEXT: ret +; SVE-LABEL: fmls_sel_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: ptrue p1.d +; SVE-NEXT: fmul z1.s, p1/m, z1.s, z2.s +; SVE-NEXT: lsl z2.s, z0.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsubr z1.s, p1/m, z1.s, z2.s +; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fmls_sel_nxv2bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h +; SVE-B16B16-NEXT: ret %mul = fmul contract %m1, %m2 %sub = fsub contract %acc, %mul %res = select %pred, %sub, %acc @@ -136,33 +355,90 @@ define @fmls_sel_nxv2bf16( %pred, @fadd_sel_nxv8bf16( %a, %b, %mask) { -; CHECK-LABEL: fadd_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %sel = select %mask, %b, zeroinitializer %fadd = fadd nsz %a, %sel ret %fadd } define @fsub_sel_nxv8bf16( %a, %b, %mask) { -; CHECK-LABEL: fsub_sel_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %sel = select %mask, %b, zeroinitializer %fsub = fsub %a, %sel ret %fsub } define @fadd_sel_negzero_nxv8bf16( %a, %b, %mask) { -; CHECK-LABEL: fadd_sel_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %nz = fneg zeroinitializer %sel = select %mask, %b, %nz %fadd = fadd %a, %sel @@ -170,11 +446,30 @@ define @fadd_sel_negzero_nxv8bf16( %a } define @fsub_sel_negzero_nxv8bf16( %a, %b, %mask) { -; CHECK-LABEL: fsub_sel_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %nz = fneg zeroinitializer %sel = select %mask, %b, %nz %fsub = fsub nsz %a, %sel @@ -182,13 +477,46 @@ define @fsub_sel_negzero_nxv8bf16( %a } define @fadd_sel_fmul_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fadd_sel_fmul_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: bfadd z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: movi v3.2d, #0000000000000000 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p1/m, z2.s +; SVE-NEXT: bfcvt z0.h, p1/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: movi v3.2d, #0000000000000000 +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-B16B16-NEXT: bfadd z0.h, z0.h, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %sel = select %mask, %fmul, zeroinitializer %fadd = fadd contract %a, %sel @@ -196,12 +524,41 @@ define @fadd_sel_fmul_nxv8bf16( %a, < } define @fsub_sel_fmul_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fsub_sel_fmul_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %sel = select %mask, %fmul, zeroinitializer %fsub = fsub contract %a, %sel @@ -209,12 +566,41 @@ define @fsub_sel_fmul_nxv8bf16( %a, < } define @fadd_sel_fmul_nsz_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fadd_sel_fmul_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %sel = select %mask, %fmul, zeroinitializer %fadd = fadd nsz contract %a, %sel @@ -222,12 +608,41 @@ define @fadd_sel_fmul_nsz_nxv8bf16( % } define @fsub_sel_fmul_nsz_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fsub_sel_fmul_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %sel = select %mask, %fmul, zeroinitializer %fsub = fsub nsz contract %a, %sel @@ -235,12 +650,41 @@ define @fsub_sel_fmul_nsz_nxv8bf16( % } define @fadd_sel_fmul_negzero_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fadd_sel_fmul_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %nz = fneg zeroinitializer %sel = select %mask, %fmul, %nz @@ -249,15 +693,50 @@ define @fadd_sel_fmul_negzero_nxv8bf16( @fsub_sel_fmul_negzero_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fsub_sel_fmul_negzero_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: fmov h3, w8 -; CHECK-NEXT: mov z3.h, h3 -; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: bfsub z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_negzero_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: mov w8, #32768 // =0x8000 +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: fmov h3, w8 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: mov z3.h, h3 +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z0.s, z0.s, z1.s +; SVE-NEXT: bfcvt z1.h, p1/m, z2.s +; SVE-NEXT: bfcvt z0.h, p1/m, z0.s +; SVE-NEXT: uzp1 z0.h, z0.h, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: mov w8, #32768 // =0x8000 +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: fmov h3, w8 +; SVE-B16B16-NEXT: mov z3.h, h3 +; SVE-B16B16-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-B16B16-NEXT: bfsub z0.h, z0.h, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %nz = fneg zeroinitializer %sel = select %mask, %fmul, %nz @@ -266,12 +745,41 @@ define @fsub_sel_fmul_negzero_nxv8bf16( @fadd_sel_fmul_negzero_nsz_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfadd z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fadd z2.s, z3.s, z2.s +; SVE-NEXT: fadd z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %nz = fneg zeroinitializer %sel = select %mask, %fmul, %nz @@ -280,12 +788,41 @@ define @fadd_sel_fmul_negzero_nsz_nxv8bf16( @fsub_sel_fmul_negzero_nsz_nxv8bf16( %a, %b, %c, %mask) { -; CHECK-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: bfmul z1.h, z1.h, z2.h -; CHECK-NEXT: bfsub z1.h, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: uunpkhi z3.s, z2.h +; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: ptrue p1.s +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fmul z3.s, z4.s, z3.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: lsl z4.s, z4.s, #16 +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: lsl z1.s, z1.s, #16 +; SVE-NEXT: fsub z2.s, z3.s, z2.s +; SVE-NEXT: fsub z1.s, z4.s, z1.s +; SVE-NEXT: bfcvt z2.h, p1/m, z2.s +; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: mov z0.h, p0/m, z1.h +; SVE-NEXT: ret +; +; SVE-B16B16-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: +; SVE-B16B16: // %bb.0: +; SVE-B16B16-NEXT: bfmul z1.h, z1.h, z2.h +; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h +; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h +; SVE-B16B16-NEXT: ret %fmul = fmul %b, %c %nz = fneg zeroinitializer %sel = select %mask, %fmul, %nz From e0fd9b91c59682e6df886c71ae50968acafc3b21 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Mon, 10 Nov 2025 15:54:13 +0000 Subject: [PATCH 2/4] [LLVM][CodeGen][SVE] Enable BFloat fma contraction more aggressively. --- .../Target/AArch64/AArch64ISelLowering.cpp | 2 +- .../test/CodeGen/AArch64/sve-bf16-combines.ll | 148 +++++++----------- 2 files changed, 55 insertions(+), 95 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..da6c65f2c1c7d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18570,7 +18570,7 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( case MVT::f64: return true; case MVT::bf16: - return VT.isScalableVector() && Subtarget->hasSVEB16B16() && + return VT.isScalableVector() && Subtarget->hasBF16() && Subtarget->isNonStreamingSVEorSME2Available(); default: break; diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index 1e3657ad703d9..230bd9cf5420f 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -9,26 +9,20 @@ define @fmla_nxv8bf16( %acc, @fmla_nxv4bf16( %acc, @fmla_nxv2bf16( %acc, @fmla_nxv2bf16( %acc, @fmls_nxv8bf16( %acc, %m1, %m2) { ; SVE-LABEL: fmls_nxv8bf16: ; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.h ; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpkhi z4.s, z0.h ; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: fneg z1.h, p0/m, z1.h ; SVE-NEXT: ptrue p0.s ; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z4.s, z4.s, #16 ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p0/m, z3.s -; SVE-NEXT: uunpkhi z3.s, z0.h -; SVE-NEXT: uunpklo z0.s, z0.h -; SVE-NEXT: bfcvt z1.h, p0/m, z1.s -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: uunpkhi z5.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z5.s, z5.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fsub z2.s, z3.s, z2.s -; SVE-NEXT: fsub z0.s, z0.s, z1.s -; SVE-NEXT: bfcvt z1.h, p0/m, z2.s +; SVE-NEXT: fmad z3.s, p0/m, z5.s, z4.s +; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfcvt z1.h, p0/m, z3.s ; SVE-NEXT: bfcvt z0.h, p0/m, z0.s ; SVE-NEXT: uzp1 z0.h, z0.h, z1.h ; SVE-NEXT: ret @@ -133,14 +117,12 @@ define @fmls_nxv8bf16( %acc, @fmls_nxv4bf16( %acc, %m1, %m2) { ; SVE-LABEL: fmls_nxv4bf16: ; SVE: // %bb.0: -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: ptrue p0.s +; SVE-NEXT: lsl z2.s, z2.s, #16 ; SVE-NEXT: lsl z0.s, z0.s, #16 -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z1.h, p0/m, z1.s +; SVE-NEXT: fneg z1.h, p0/m, z1.h ; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fsub z0.s, z0.s, z1.s +; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s ; SVE-NEXT: bfcvt z0.h, p0/m, z0.s ; SVE-NEXT: ret ; @@ -157,14 +139,12 @@ define @fmls_nxv4bf16( %acc, @fmls_nxv2bf16( %acc, %m1, %m2) { ; SVE-LABEL: fmls_nxv2bf16: ; SVE: // %bb.0: -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: ptrue p0.d +; SVE-NEXT: lsl z2.s, z2.s, #16 ; SVE-NEXT: lsl z0.s, z0.s, #16 -; SVE-NEXT: fmul z1.s, p0/m, z1.s, z2.s -; SVE-NEXT: bfcvt z1.h, p0/m, z1.s +; SVE-NEXT: fneg z1.h, p0/m, z1.h ; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fsub z0.s, p0/m, z0.s, z1.s +; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s ; SVE-NEXT: bfcvt z0.h, p0/m, z0.s ; SVE-NEXT: ret ; @@ -183,26 +163,20 @@ define @fmla_sel_nxv8bf16( %pred, @fmla_sel_nxv4bf16( %pred, @fmla_sel_nxv2bf16( %pred, @fmla_sel_nxv2bf16( %pred, @fmls_sel_nxv8bf16( %pred, %acc, %m1, %m2) { ; SVE-LABEL: fmls_sel_nxv8bf16: ; SVE: // %bb.0: +; SVE-NEXT: ptrue p1.h ; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h +; SVE-NEXT: uunpkhi z4.s, z0.h ; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: uunpklo z6.s, z0.h +; SVE-NEXT: fneg z1.h, p1/m, z1.h ; SVE-NEXT: ptrue p1.s ; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z4.s, z4.s, #16 ; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z6.s, z6.s, #16 +; SVE-NEXT: uunpkhi z5.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: lsl z5.s, z5.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s +; SVE-NEXT: fmad z3.s, p1/m, z5.s, z4.s +; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s ; SVE-NEXT: bfcvt z2.h, p1/m, z3.s -; SVE-NEXT: uunpkhi z3.s, z0.h -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fsub z2.s, z3.s, z2.s -; SVE-NEXT: fsub z1.s, z4.s, z1.s -; SVE-NEXT: bfcvt z2.h, p1/m, z2.s ; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: uzp1 z1.h, z1.h, z2.h ; SVE-NEXT: mov z0.h, p0/m, z1.h @@ -309,14 +273,12 @@ define @fmls_sel_nxv8bf16( %pred, @fmls_sel_nxv4bf16( %pred, %acc, %m1, %m2) { ; SVE-LABEL: fmls_sel_nxv4bf16: ; SVE: // %bb.0: -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: lsl z2.s, z0.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p1/m, z1.h ; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fsub z1.s, z2.s, z1.s +; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s ; SVE-NEXT: bfcvt z0.h, p0/m, z1.s ; SVE-NEXT: ret ; @@ -333,14 +295,12 @@ define @fmls_sel_nxv4bf16( %pred, @fmls_sel_nxv2bf16( %pred, %acc, %m1, %m2) { ; SVE-LABEL: fmls_sel_nxv2bf16: ; SVE: // %bb.0: -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: ptrue p1.d -; SVE-NEXT: fmul z1.s, p1/m, z1.s, z2.s -; SVE-NEXT: lsl z2.s, z0.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s +; SVE-NEXT: lsl z2.s, z2.s, #16 +; SVE-NEXT: lsl z3.s, z0.s, #16 +; SVE-NEXT: fneg z1.h, p1/m, z1.h ; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fsubr z1.s, p1/m, z1.s, z2.s +; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s ; SVE-NEXT: bfcvt z0.h, p0/m, z1.s ; SVE-NEXT: ret ; From 9886945da72e51cc8a2b6203088e8cd76e8bea94 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Mon, 10 Nov 2025 15:29:29 +0000 Subject: [PATCH 3/4] [LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma operations. NOTE: From what I can see LLVM has no support for FEAT_AFP in terms of feature detection or ACLE builtins and so I believe the compiler can (and does) work under the assumption the feature is not enabled. --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 4 + llvm/test/CodeGen/AArch64/sve-bf16-arith.ll | 28 +++--- .../test/CodeGen/AArch64/sve-bf16-combines.ll | 95 +++++++------------ 3 files changed, 50 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3b268dcbca600..ce6de5c780cf3 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2578,6 +2578,10 @@ let Predicates = [HasBF16, HasSVE_or_SME] in { defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>; defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>; + def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))), + (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))), + (BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>; + defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>; defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>; } // End HasBF16, HasSVE_or_SME diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll index 0580f5e0b019a..582e8456c05b3 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll @@ -466,12 +466,10 @@ define @fmla_nxv2bf16( %a, @fmla_nxv4bf16( %a, %b, %c) { ; NOB16B16-LABEL: fmla_nxv4bf16: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16 ; NOB16B16-NEXT: ptrue p0.s -; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s ; NOB16B16-NEXT: ret ; ; B16B16-LABEL: fmla_nxv4bf16: @@ -486,24 +484,20 @@ define @fmla_nxv4bf16( %a, @fmla_nxv8bf16( %a, %b, %c) { ; NOB16B16-LABEL: fmla_nxv8bf16: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: uunpkhi z3.s, z1.h -; NOB16B16-NEXT: uunpkhi z4.s, z0.h -; NOB16B16-NEXT: uunpkhi z5.s, z2.h +; NOB16B16-NEXT: uunpkhi z3.s, z2.h +; NOB16B16-NEXT: uunpklo z2.s, z2.h +; NOB16B16-NEXT: uunpkhi z4.s, z1.h +; NOB16B16-NEXT: uunpkhi z5.s, z0.h ; NOB16B16-NEXT: uunpklo z1.s, z1.h ; NOB16B16-NEXT: uunpklo z0.s, z0.h -; NOB16B16-NEXT: uunpklo z2.s, z2.h ; NOB16B16-NEXT: ptrue p0.s ; NOB16B16-NEXT: lsl z3.s, z3.s, #16 -; NOB16B16-NEXT: lsl z4.s, z4.s, #16 -; NOB16B16-NEXT: lsl z5.s, z5.s, #16 -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 ; NOB16B16-NEXT: lsl z2.s, z2.s, #16 -; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s -; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s -; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s -; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: bfmlalb z3.s, z5.h, z4.h +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z3.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: uzp1 z0.h, z1.h, z0.h ; NOB16B16-NEXT: ret ; ; B16B16-LABEL: fmla_nxv8bf16: diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index 230bd9cf5420f..16e8feb0dc5bb 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -7,21 +7,17 @@ target triple = "aarch64-unknown-linux-gnu" define @fmla_nxv8bf16( %acc, %m1, %m2) { ; SVE-LABEL: fmla_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpkhi z5.s, z0.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z0.s, z0.h +; SVE-NEXT: uunpkhi z4.s, z2.h +; SVE-NEXT: uunpkhi z5.s, z1.h ; SVE-NEXT: uunpklo z2.s, z2.h ; SVE-NEXT: uunpklo z1.s, z1.h -; SVE-NEXT: uunpklo z0.s, z0.h ; SVE-NEXT: ptrue p0.s ; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z5.s, z5.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: lsl z0.s, z0.s, #16 -; SVE-NEXT: fmad z3.s, p0/m, z4.s, z5.s -; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfmlalb z3.s, z5.h, z4.h +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h ; SVE-NEXT: bfcvt z1.h, p0/m, z3.s ; SVE-NEXT: bfcvt z0.h, p0/m, z0.s ; SVE-NEXT: uzp1 z0.h, z0.h, z1.h @@ -40,11 +36,9 @@ define @fmla_nxv8bf16( %acc, @fmla_nxv4bf16( %acc, %m1, %m2) { ; SVE-LABEL: fmla_nxv4bf16: ; SVE: // %bb.0: -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: lsl z0.s, z0.s, #16 ; SVE-NEXT: ptrue p0.s -; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h ; SVE-NEXT: bfcvt z0.h, p0/m, z0.s ; SVE-NEXT: ret ; @@ -83,22 +77,18 @@ define @fmls_nxv8bf16( %acc, @fmls_nxv4bf16( %acc, @fmls_nxv2bf16( %acc, @fmla_sel_nxv8bf16( %pred, %acc, %m1, %m2) { ; SVE-LABEL: fmla_sel_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpkhi z5.s, z0.h +; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: uunpkhi z5.s, z2.h +; SVE-NEXT: uunpkhi z6.s, z1.h ; SVE-NEXT: uunpklo z2.s, z2.h ; SVE-NEXT: uunpklo z1.s, z1.h -; SVE-NEXT: uunpklo z6.s, z0.h ; SVE-NEXT: ptrue p1.s ; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z5.s, z5.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: lsl z6.s, z6.s, #16 -; SVE-NEXT: fmad z3.s, p1/m, z4.s, z5.s -; SVE-NEXT: fmad z1.s, p1/m, z2.s, z6.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s -; SVE-NEXT: uzp1 z1.h, z1.h, z2.h +; SVE-NEXT: bfmlalb z3.s, z6.h, z5.h +; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s +; SVE-NEXT: bfcvt z2.h, p1/m, z4.s +; SVE-NEXT: uzp1 z1.h, z2.h, z1.h ; SVE-NEXT: mov z0.h, p0/m, z1.h ; SVE-NEXT: ret ; @@ -195,12 +179,9 @@ define @fmla_sel_nxv8bf16( %pred, @fmla_sel_nxv4bf16( %pred, %acc, %m1, %m2) { ; SVE-LABEL: fmla_sel_nxv4bf16: ; SVE: // %bb.0: -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: lsl z3.s, z0.s, #16 -; SVE-NEXT: ptrue p1.s -; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s -; SVE-NEXT: bfcvt z0.h, p0/m, z1.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfcvt z0.h, p0/m, z3.s ; SVE-NEXT: ret ; ; SVE-B16B16-LABEL: fmla_sel_nxv4bf16: @@ -238,25 +219,21 @@ define @fmls_sel_nxv8bf16( %pred, @fmls_sel_nxv4bf16( %pred, Date: Mon, 10 Nov 2025 18:36:33 +0000 Subject: [PATCH 4/4] Reformat AArch64SVEInstrInfo.td changes. --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index ce6de5c780cf3..ece012a035bfc 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2578,8 +2578,9 @@ let Predicates = [HasBF16, HasSVE_or_SME] in { defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>; defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>; - def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))), - (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))), + def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, + (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))), + (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))), (BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>; defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>;