[LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma operations. #167340
Conversation
NOTE: From what I can see, LLVM has no support for FEAT_AFP in terms of feature detection or ACLE builtins, so I believe the compiler can (and does) work under the assumption that the feature is not enabled.
@llvm/pr-subscribers-backend-aarch64

Author: Paul Walker (paulwalker-arm)

Changes

We're likely to get better code from custom legalisation, where we can remove unpack instructions (plus SVE2p1 has BFMLSLB/T), but we get much of the benefit with these two small changes.

NOTE: LLVM has no support for FEAT_AFP in terms of feature detection or ACLE builtins, so the compiler works under the assumption that the feature is not enabled.

The patch is also more aggressive when enabling bfloat fma construction, because doing so removes unnecessary rounding, which is generally preferable regardless of whether BFMLALB is used or not.

Patch is 38.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167340.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e993b26..da6c65f2c1c7d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18570,7 +18570,7 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
case MVT::f64:
return true;
case MVT::bf16:
- return VT.isScalableVector() && Subtarget->hasSVEB16B16() &&
+ return VT.isScalableVector() && Subtarget->hasBF16() &&
Subtarget->isNonStreamingSVEorSME2Available();
default:
break;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3b268dcbca600..ce6de5c780cf3 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2578,6 +2578,10 @@ let Predicates = [HasBF16, HasSVE_or_SME] in {
defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>;
defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>;
+ def : Pat<(nxv4f32 (AArch64fmla_p (SVEAllActive), nxv4f32:$acc, (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zn, (undef))),
+ (nxv4f32 (AArch64fcvte_mt (SVEAllActive), nxv4bf16:$Zm, (undef))))),
+ (BFMLALB_ZZZ nxv4f32:$acc, ZPR:$Zn, ZPR:$Zm)>;
+
defm BFCVT_ZPmZ : sve_bfloat_convert<"bfcvt", int_aarch64_sve_fcvt_bf16f32_v2, AArch64fcvtr_mt>;
defm BFCVTNT_ZPmZ : sve_bfloat_convert_top<"bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32_v2>;
} // End HasBF16, HasSVE_or_SME
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
index 0580f5e0b019a..582e8456c05b3 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll
@@ -466,12 +466,10 @@ define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %c) {
; NOB16B16-LABEL: fmla_nxv4bf16:
; NOB16B16: // %bb.0:
-; NOB16B16-NEXT: lsl z1.s, z1.s, #16
-; NOB16B16-NEXT: lsl z0.s, z0.s, #16
; NOB16B16-NEXT: lsl z2.s, z2.s, #16
; NOB16B16-NEXT: ptrue p0.s
-; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
-; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
+; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s
; NOB16B16-NEXT: ret
;
; B16B16-LABEL: fmla_nxv4bf16:
@@ -486,24 +484,20 @@ define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x
define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) {
; NOB16B16-LABEL: fmla_nxv8bf16:
; NOB16B16: // %bb.0:
-; NOB16B16-NEXT: uunpkhi z3.s, z1.h
-; NOB16B16-NEXT: uunpkhi z4.s, z0.h
-; NOB16B16-NEXT: uunpkhi z5.s, z2.h
+; NOB16B16-NEXT: uunpkhi z3.s, z2.h
+; NOB16B16-NEXT: uunpklo z2.s, z2.h
+; NOB16B16-NEXT: uunpkhi z4.s, z1.h
+; NOB16B16-NEXT: uunpkhi z5.s, z0.h
; NOB16B16-NEXT: uunpklo z1.s, z1.h
; NOB16B16-NEXT: uunpklo z0.s, z0.h
-; NOB16B16-NEXT: uunpklo z2.s, z2.h
; NOB16B16-NEXT: ptrue p0.s
; NOB16B16-NEXT: lsl z3.s, z3.s, #16
-; NOB16B16-NEXT: lsl z4.s, z4.s, #16
-; NOB16B16-NEXT: lsl z5.s, z5.s, #16
-; NOB16B16-NEXT: lsl z1.s, z1.s, #16
-; NOB16B16-NEXT: lsl z0.s, z0.s, #16
; NOB16B16-NEXT: lsl z2.s, z2.s, #16
-; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s
-; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s
-; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s
-; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s
-; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h
+; NOB16B16-NEXT: bfmlalb z3.s, z5.h, z4.h
+; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h
+; NOB16B16-NEXT: bfcvt z0.h, p0/m, z3.s
+; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s
+; NOB16B16-NEXT: uzp1 z0.h, z1.h, z0.h
; NOB16B16-NEXT: ret
;
; B16B16-LABEL: fmla_nxv8bf16:
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
index 5c58eab391972..16e8feb0dc5bb 100644
--- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll
@@ -1,79 +1,175 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=SVE
+; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=SVE-B16B16
target triple = "aarch64-unknown-linux-gnu"
define <vscale x 8 x bfloat> @fmla_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmla_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z0.s, z0.h
+; SVE-NEXT: uunpkhi z4.s, z2.h
+; SVE-NEXT: uunpkhi z5.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: bfmlalb z3.s, z5.h, z4.h
+; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.h
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
%res = fadd contract <vscale x 8 x bfloat> %acc, %mul
ret <vscale x 8 x bfloat> %res
}
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmla_nxv4bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_nxv4bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_nxv4bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.s
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
%res = fadd contract <vscale x 4 x bfloat> %acc, %mul
ret <vscale x 4 x bfloat> %res
}
define <vscale x 2 x bfloat> @fmla_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmla_nxv2bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_nxv2bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_nxv2bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.d
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
%res = fadd contract <vscale x 2 x bfloat> %acc, %mul
ret <vscale x 2 x bfloat> %res
}
define <vscale x 8 x bfloat> @fmls_nxv8bf16(<vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmls_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z0.s, z0.h
+; SVE-NEXT: uunpkhi z5.s, z2.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: fneg z1.h, p0/m, z1.h
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: uunpkhi z4.s, z1.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: bfmlalb z3.s, z4.h, z5.h
+; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z1.h, p0/m, z3.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: uzp1 z0.h, z0.h, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.h
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
%res = fsub contract <vscale x 8 x bfloat> %acc, %mul
ret <vscale x 8 x bfloat> %res
}
define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmls_nxv4bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_nxv4bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: fneg z1.h, p0/m, z1.h
+; SVE-NEXT: bfmlalb z0.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_nxv4bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.s
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
%res = fsub contract <vscale x 4 x bfloat> %acc, %mul
ret <vscale x 4 x bfloat> %res
}
define <vscale x 2 x bfloat> @fmls_nxv2bf16(<vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmls_nxv2bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_nxv2bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: fneg z1.h, p0/m, z1.h
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmla z0.s, p0/m, z1.s, z2.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z0.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_nxv2bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: ptrue p0.d
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
%res = fsub contract <vscale x 2 x bfloat> %acc, %mul
ret <vscale x 2 x bfloat> %res
}
define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmla_sel_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_sel_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: uunpkhi z5.s, z2.h
+; SVE-NEXT: uunpkhi z6.s, z1.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: bfmlalb z3.s, z6.h, z5.h
+; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z1.h, p1/m, z3.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z4.s
+; SVE-NEXT: uzp1 z1.h, z2.h, z1.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_sel_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
%add = fadd contract <vscale x 8 x bfloat> %acc, %mul
%res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %add, <vscale x 8 x bfloat> %acc
@@ -81,10 +177,17 @@ define <vscale x 8 x bfloat> @fmla_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
}
define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmla_sel_nxv4bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_sel_nxv4bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z3.s, z0.s, #16
+; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z0.h, p0/m, z3.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_sel_nxv4bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
%add = fadd contract <vscale x 4 x bfloat> %acc, %mul
%res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %add, <vscale x 4 x bfloat> %acc
@@ -92,10 +195,20 @@ define <vscale x 4 x bfloat> @fmla_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
}
define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmla_sel_nxv2bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmla_sel_nxv2bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z3.s, z0.s, #16
+; SVE-NEXT: ptrue p1.d
+; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmla_sel_nxv2bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
%add = fadd contract <vscale x 2 x bfloat> %acc, %mul
%res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %add, <vscale x 2 x bfloat> %acc
@@ -103,10 +216,31 @@ define <vscale x 2 x bfloat> @fmla_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
}
define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %acc, <vscale x 8 x bfloat> %m1, <vscale x 8 x bfloat> %m2) {
-; CHECK-LABEL: fmls_sel_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_sel_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p1.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: uunpkhi z6.s, z2.h
+; SVE-NEXT: uunpklo z2.s, z2.h
+; SVE-NEXT: fneg z1.h, p1/m, z1.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: uunpkhi z5.s, z1.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: bfmlalb z3.s, z5.h, z6.h
+; SVE-NEXT: bfmlalb z4.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z1.h, p1/m, z3.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z4.s
+; SVE-NEXT: uzp1 z1.h, z2.h, z1.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_sel_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 8 x bfloat> %m1, %m2
%sub = fsub contract <vscale x 8 x bfloat> %acc, %mul
%res = select <vscale x 8 x i1> %pred, <vscale x 8 x bfloat> %sub, <vscale x 8 x bfloat> %acc
@@ -114,10 +248,19 @@ define <vscale x 8 x bfloat> @fmls_sel_nxv8bf16(<vscale x 8 x i1> %pred, <vscale
}
define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
-; CHECK-LABEL: fmls_sel_nxv4bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_sel_nxv4bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z3.s, z0.s, #16
+; SVE-NEXT: fneg z1.h, p1/m, z1.h
+; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h
+; SVE-NEXT: bfcvt z0.h, p0/m, z3.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_sel_nxv4bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
%sub = fsub contract <vscale x 4 x bfloat> %acc, %mul
%res = select <vscale x 4 x i1> %pred, <vscale x 4 x bfloat> %sub, <vscale x 4 x bfloat> %acc
@@ -125,10 +268,21 @@ define <vscale x 4 x bfloat> @fmls_sel_nxv4bf16(<vscale x 4 x i1> %pred, <vscale
}
define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %acc, <vscale x 2 x bfloat> %m1, <vscale x 2 x bfloat> %m2) {
-; CHECK-LABEL: fmls_sel_nxv2bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fmls_sel_nxv2bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: ptrue p1.d
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z0.s, #16
+; SVE-NEXT: fneg z1.h, p1/m, z1.h
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: fmad z1.s, p1/m, z2.s, z3.s
+; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fmls_sel_nxv2bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
+; SVE-B16B16-NEXT: ret
%mul = fmul contract <vscale x 2 x bfloat> %m1, %m2
%sub = fsub contract <vscale x 2 x bfloat> %acc, %mul
%res = select <vscale x 2 x i1> %pred, <vscale x 2 x bfloat> %sub, <vscale x 2 x bfloat> %acc
@@ -136,33 +290,90 @@ define <vscale x 2 x bfloat> @fmls_sel_nxv2bf16(<vscale x 2 x i1> %pred, <vscale
}
define <vscale x 8 x bfloat> @fadd_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fadd_sel_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfadd z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fadd_sel_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z2.s, z1.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: fadd z2.s, z3.s, z2.s
+; SVE-NEXT: fadd z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fadd_sel_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfadd z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer
%fadd = fadd nsz <vscale x 8 x bfloat> %a, %sel
ret <vscale x 8 x bfloat> %fadd
}
define <vscale x 8 x bfloat> @fsub_sel_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
-; CHECK-LABEL: fsub_sel_nxv8bf16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: bfsub z1.h, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: fsub_sel_nxv8bf16:
+; SVE: // %bb.0:
+; SVE-NEXT: uunpkhi z2.s, z1.h
+; SVE-NEXT: uunpkhi z3.s, z0.h
+; SVE-NEXT: uunpklo z1.s, z1.h
+; SVE-NEXT: uunpklo z4.s, z0.h
+; SVE-NEXT: ptrue p1.s
+; SVE-NEXT: lsl z2.s, z2.s, #16
+; SVE-NEXT: lsl z3.s, z3.s, #16
+; SVE-NEXT: lsl z1.s, z1.s, #16
+; SVE-NEXT: lsl z4.s, z4.s, #16
+; SVE-NEXT: fsub z2.s, z3.s, z2.s
+; SVE-NEXT: fsub z1.s, z4.s, z1.s
+; SVE-NEXT: bfcvt z2.h, p1/m, z2.s
+; SVE-NEXT: bfcvt z1.h, p1/m, z1.s
+; SVE-NEXT: uzp1 z1.h, z1.h, z2.h
+; SVE-NEXT: mov z0.h, p0/m, z1.h
+; SVE-NEXT: ret
+;
+; SVE-B16B16-LABEL: fsub_sel_nxv8bf16:
+; SVE-B16B16: // %bb.0:
+; SVE-B16B16-NEXT: bfsub z1.h, z0.h, z1.h
+; SVE-B16B16-NEXT: mov z0.h, p0/m, z1.h
+; SVE-B16B16-NEXT: ret
%sel = select <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> zeroinitializer
%fsub = fsub <vscale x 8 x bfloat> %a, %sel
ret <vscale x 8 x bfloat> %fsub
}
define <vscale x 8 x bfloat> @fadd_sel_negzero_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x i1> %mask) {
...
[truncated]
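To make the effect of the two changes concrete, here is the nxv4bf16 case from the updated sve-bf16-combines.ll test above, reduced to a standalone example. With only +sve and +bf16, the contract-flagged mul/add pair is now fused into an fma and matched by the new pattern, so a single bfmlalb plus the final bfcvt replaces the old extend-both-operands-then-fmad sequence. This is a restatement of the fmla_nxv4bf16 check lines in the diff, not new behaviour:

; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"

; The relaxed isFMAFasterThanFMulAndFAdd hook lets DAGCombine form an fma for
; promoted bf16, and the new pattern folds the operand extends and the fma
; into bfmlalb, accumulating in f32 before the single rounding bfcvt.
; CHECK-LABEL: fmla_nxv4bf16:
; CHECK:         lsl z0.s, z0.s, #16
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h
; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    ret
define <vscale x 4 x bfloat> @fmla_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
  %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
  %res = fadd contract <vscale x 4 x bfloat> %acc, %mul
  ret <vscale x 4 x bfloat> %res
}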
MacDue left a comment:
LGTM!
[LLVM][CodeGen][SVE] Use BFMLALB for promoted bfloat fma operations. (llvm#167340)
We're likely to get better code from custom legalisation, where we can remove unpack instructions (plus SVE2p1 has BFMLSLB/T), but we get much of the benefit with these two small changes.

NOTE: LLVM has no support for FEAT_AFP in terms of feature detection or ACLE builtins, so the compiler works under the assumption that the feature is not enabled.

The patch is also more aggressive when enabling bfloat fma construction, because doing so removes unnecessary rounding, which is generally preferable regardless of whether BFMLALB is used or not.
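As a concrete illustration of the BFMLSLB/T point: without +sve-b16b16, the fmls form currently has to negate one multiplicand in bf16 before it can use bfmlalb, as the updated fmls_nxv4bf16 check lines in the diff show; custom legalisation targeting SVE2p1's BFMLSLB/T could presumably fold that fneg away. Below is a standalone restatement of the existing test, with no new behaviour assumed:

; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"

; The fsub is fused into a negated fma; the negation stays as an explicit
; bf16 fneg in front of the widening bfmlalb.
; CHECK-LABEL: fmls_nxv4bf16:
; CHECK:         ptrue p0.s
; CHECK-NEXT:    lsl z0.s, z0.s, #16
; CHECK-NEXT:    fneg z1.h, p0/m, z1.h
; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h
; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    ret
define <vscale x 4 x bfloat> @fmls_nxv4bf16(<vscale x 4 x bfloat> %acc, <vscale x 4 x bfloat> %m1, <vscale x 4 x bfloat> %m2) {
  %mul = fmul contract <vscale x 4 x bfloat> %m1, %m2
  %res = fsub contract <vscale x 4 x bfloat> %acc, %mul
  ret <vscale x 4 x bfloat> %res
}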