-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AArch64] Use SVE for fixed-length bf16 operations with +sve-b16b16 #169329
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+960
−87
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Lowering fixed-length bf16 operations to SVE can avoid the promotion bf16 -> f32 -> bf16 round trip.
Member
|
@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
Changes: This can avoid the promotion bf16 -> f32 -> bf16 round trip (or costly expansions).
Patch is 39.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169329.diff
3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e91f5a877b35b..df024f571a4d1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1783,9 +1783,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
+ }
- if (Subtarget->hasSVEB16B16() &&
- Subtarget->isNonStreamingSVEorSME2Available()) {
+ if (Subtarget->hasSVEB16B16() &&
+ Subtarget->isNonStreamingSVEorSME2Available()) {
+ // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
+ for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
+ MVT::nxv8bf16}) {
setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FMA, VT, Custom);
setOperationAction(ISD::FMAXIMUM, VT, Custom);
diff --git a/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll b/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll
new file mode 100644
index 0000000000000..e6344b9eb89dc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll
@@ -0,0 +1,936 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
+; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; FABS
+;
+
+define <4 x bfloat> @fabs_v4bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: fabs_v4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.4h, #128, lsl #8
+; CHECK-NEXT: ret
+ %res = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> %a)
+ ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fabs_v8bf16(<8 x bfloat> %a) {
+; CHECK-LABEL: fabs_v8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bic v0.8h, #128, lsl #8
+; CHECK-NEXT: ret
+ %res = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %a)
+ ret <8 x bfloat> %res
+}
+
+;
+; FADD
+;
+
+define <4 x bfloat> @fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_v4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT: fadd v0.4s, v0.4s, v1.4s
+; NOB16B16-NEXT: bfcvtn v0.4h, v0.4s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fadd_v4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h, vl4
+; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT: bfadd z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT: ret
+ %res = fadd <4 x bfloat> %a, %b
+ ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_v8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: shll v2.4s, v1.4h, #16
+; NOB16B16-NEXT: shll v3.4s, v0.4h, #16
+; NOB16B16-NEXT: shll2 v1.4s, v1.8h, #16
+; NOB16B16-NEXT: shll2 v0.4s, v0.8h, #16
+; NOB16B16-NEXT: fadd v2.4s, v3.4s, v2.4s
+; NOB16B16-NEXT: fadd v1.4s, v0.4s, v1.4s
+; NOB16B16-NEXT: bfcvtn v0.4h, v2.4s
+; NOB16B16-NEXT: bfcvtn2 v0.8h, v1.4s
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fadd_v8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h, vl8
+; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT: bfadd z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT: ret
+ %res = fadd <8 x bfloat> %a, %b
+ ret <8 x bfloat> %res
+}
+
+;
+; FDIV
+;
+
+define <4 x bfloat> @fdiv_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: fdiv_v4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %res = fdiv <4 x bfloat> %a, %b
+ ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fdiv_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: fdiv_v8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v2.4s, v1.4h, #16
+; CHECK-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-NEXT: shll2 v1.4s, v1.8h, #16
+; CHECK-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-NEXT: fdiv v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: fdiv v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: bfcvtn v0.4h, v2.4s
+; CHECK-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-NEXT: ret
+ %res = fdiv <8 x bfloat> %a, %b
+ ret <8 x bfloat> %res
+}
+
+;
+; FMAX
+;
+
+define <4 x bfloat> @fmax_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_v4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT: mov h2, v1.h[1]
+; NOB16B16-NEXT: mov h3, v0.h[1]
+; NOB16B16-NEXT: mov h4, v1.h[2]
+; NOB16B16-NEXT: shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT: shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT: mov h7, v0.h[2]
+; NOB16B16-NEXT: mov h1, v1.h[3]
+; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT: fmax s2, s3, s2
+; NOB16B16-NEXT: fmax s3, s6, s5
+; NOB16B16-NEXT: shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT: mov h6, v0.h[3]
+; NOB16B16-NEXT: fmax s4, s5, s4
+; NOB16B16-NEXT: bfcvt h2, s2
+; NOB16B16-NEXT: bfcvt h0, s3
+; NOB16B16-NEXT: shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT: mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT: bfcvt h2, s4
+; NOB16B16-NEXT: fmax s1, s3, s1
+; NOB16B16-NEXT: mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT: bfcvt h1, s1
+; NOB16B16-NEXT: mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmax_v4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h, vl4
+; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT: ret
+ %res = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fmax_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_v8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: mov h2, v1.h[1]
+; NOB16B16-NEXT: mov h3, v0.h[1]
+; NOB16B16-NEXT: shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT: shll v5.4s, v0.4h, #16
+; NOB16B16-NEXT: mov h6, v1.h[2]
+; NOB16B16-NEXT: mov h7, v0.h[2]
+; NOB16B16-NEXT: mov h16, v1.h[3]
+; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: fmax s4, s5, s4
+; NOB16B16-NEXT: mov h5, v0.h[3]
+; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT: fmax s3, s3, s2
+; NOB16B16-NEXT: bfcvt h2, s4
+; NOB16B16-NEXT: fmax s4, s7, s6
+; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
+; NOB16B16-NEXT: mov h7, v1.h[4]
+; NOB16B16-NEXT: mov h16, v0.h[4]
+; NOB16B16-NEXT: bfcvt h3, s3
+; NOB16B16-NEXT: fmax s5, s5, s6
+; NOB16B16-NEXT: bfcvt h4, s4
+; NOB16B16-NEXT: mov h6, v0.h[5]
+; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT: shll v16.4s, v16.4h, #16
+; NOB16B16-NEXT: mov v2.h[1], v3.h[0]
+; NOB16B16-NEXT: mov h3, v1.h[5]
+; NOB16B16-NEXT: bfcvt h5, s5
+; NOB16B16-NEXT: fmax s7, s16, s7
+; NOB16B16-NEXT: mov h16, v0.h[6]
+; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT: mov h0, v0.h[7]
+; NOB16B16-NEXT: mov v2.h[2], v4.h[0]
+; NOB16B16-NEXT: mov h4, v1.h[6]
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: mov h1, v1.h[7]
+; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT: fmax s3, s6, s3
+; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT: mov v2.h[3], v5.h[0]
+; NOB16B16-NEXT: bfcvt h5, s7
+; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT: fmax s4, s6, s4
+; NOB16B16-NEXT: bfcvt h3, s3
+; NOB16B16-NEXT: mov v2.h[4], v5.h[0]
+; NOB16B16-NEXT: fmax s0, s0, s1
+; NOB16B16-NEXT: mov v2.h[5], v3.h[0]
+; NOB16B16-NEXT: bfcvt h3, s4
+; NOB16B16-NEXT: bfcvt h0, s0
+; NOB16B16-NEXT: mov v2.h[6], v3.h[0]
+; NOB16B16-NEXT: mov v2.h[7], v0.h[0]
+; NOB16B16-NEXT: mov v0.16b, v2.16b
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmax_v8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h, vl8
+; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT: ret
+ %res = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+ ret <8 x bfloat> %res
+}
+
+;
+; FMAXNM
+;
+
+define <4 x bfloat> @fmaxnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_v4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT: mov h2, v1.h[1]
+; NOB16B16-NEXT: mov h3, v0.h[1]
+; NOB16B16-NEXT: mov h4, v1.h[2]
+; NOB16B16-NEXT: shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT: shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT: mov h7, v0.h[2]
+; NOB16B16-NEXT: mov h1, v1.h[3]
+; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT: fmaxnm s2, s3, s2
+; NOB16B16-NEXT: fmaxnm s3, s6, s5
+; NOB16B16-NEXT: shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT: mov h6, v0.h[3]
+; NOB16B16-NEXT: fmaxnm s4, s5, s4
+; NOB16B16-NEXT: bfcvt h2, s2
+; NOB16B16-NEXT: bfcvt h0, s3
+; NOB16B16-NEXT: shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT: mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT: bfcvt h2, s4
+; NOB16B16-NEXT: fmaxnm s1, s3, s1
+; NOB16B16-NEXT: mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT: bfcvt h1, s1
+; NOB16B16-NEXT: mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmaxnm_v4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h, vl4
+; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT: ret
+ %res = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fmaxnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_v8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: mov h2, v1.h[1]
+; NOB16B16-NEXT: mov h3, v0.h[1]
+; NOB16B16-NEXT: shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT: shll v5.4s, v0.4h, #16
+; NOB16B16-NEXT: mov h6, v1.h[2]
+; NOB16B16-NEXT: mov h7, v0.h[2]
+; NOB16B16-NEXT: mov h16, v1.h[3]
+; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: fmaxnm s4, s5, s4
+; NOB16B16-NEXT: mov h5, v0.h[3]
+; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT: fmaxnm s3, s3, s2
+; NOB16B16-NEXT: bfcvt h2, s4
+; NOB16B16-NEXT: fmaxnm s4, s7, s6
+; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
+; NOB16B16-NEXT: mov h7, v1.h[4]
+; NOB16B16-NEXT: mov h16, v0.h[4]
+; NOB16B16-NEXT: bfcvt h3, s3
+; NOB16B16-NEXT: fmaxnm s5, s5, s6
+; NOB16B16-NEXT: bfcvt h4, s4
+; NOB16B16-NEXT: mov h6, v0.h[5]
+; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT: shll v16.4s, v16.4h, #16
+; NOB16B16-NEXT: mov v2.h[1], v3.h[0]
+; NOB16B16-NEXT: mov h3, v1.h[5]
+; NOB16B16-NEXT: bfcvt h5, s5
+; NOB16B16-NEXT: fmaxnm s7, s16, s7
+; NOB16B16-NEXT: mov h16, v0.h[6]
+; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT: mov h0, v0.h[7]
+; NOB16B16-NEXT: mov v2.h[2], v4.h[0]
+; NOB16B16-NEXT: mov h4, v1.h[6]
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: mov h1, v1.h[7]
+; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT: fmaxnm s3, s6, s3
+; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT: mov v2.h[3], v5.h[0]
+; NOB16B16-NEXT: bfcvt h5, s7
+; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT: fmaxnm s4, s6, s4
+; NOB16B16-NEXT: bfcvt h3, s3
+; NOB16B16-NEXT: mov v2.h[4], v5.h[0]
+; NOB16B16-NEXT: fmaxnm s0, s0, s1
+; NOB16B16-NEXT: mov v2.h[5], v3.h[0]
+; NOB16B16-NEXT: bfcvt h3, s4
+; NOB16B16-NEXT: bfcvt h0, s0
+; NOB16B16-NEXT: mov v2.h[6], v3.h[0]
+; NOB16B16-NEXT: mov v2.h[7], v0.h[0]
+; NOB16B16-NEXT: mov v0.16b, v2.16b
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmaxnm_v8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h, vl8
+; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT: ret
+ %res = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+ ret <8 x bfloat> %res
+}
+
+;
+; FMIN
+;
+
+define <4 x bfloat> @fmin_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_v4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT: mov h2, v1.h[1]
+; NOB16B16-NEXT: mov h3, v0.h[1]
+; NOB16B16-NEXT: mov h4, v1.h[2]
+; NOB16B16-NEXT: shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT: shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT: mov h7, v0.h[2]
+; NOB16B16-NEXT: mov h1, v1.h[3]
+; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT: fmin s2, s3, s2
+; NOB16B16-NEXT: fmin s3, s6, s5
+; NOB16B16-NEXT: shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT: mov h6, v0.h[3]
+; NOB16B16-NEXT: fmin s4, s5, s4
+; NOB16B16-NEXT: bfcvt h2, s2
+; NOB16B16-NEXT: bfcvt h0, s3
+; NOB16B16-NEXT: shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT: mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT: bfcvt h2, s4
+; NOB16B16-NEXT: fmin s1, s3, s1
+; NOB16B16-NEXT: mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT: bfcvt h1, s1
+; NOB16B16-NEXT: mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmin_v4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h, vl4
+; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT: ret
+ %res = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fmin_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_v8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: mov h2, v1.h[1]
+; NOB16B16-NEXT: mov h3, v0.h[1]
+; NOB16B16-NEXT: shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT: shll v5.4s, v0.4h, #16
+; NOB16B16-NEXT: mov h6, v1.h[2]
+; NOB16B16-NEXT: mov h7, v0.h[2]
+; NOB16B16-NEXT: mov h16, v1.h[3]
+; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: fmin s4, s5, s4
+; NOB16B16-NEXT: mov h5, v0.h[3]
+; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT: fmin s3, s3, s2
+; NOB16B16-NEXT: bfcvt h2, s4
+; NOB16B16-NEXT: fmin s4, s7, s6
+; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT: shll v5.4s, v5.4h, #16
+; NOB16B16-NEXT: mov h7, v1.h[4]
+; NOB16B16-NEXT: mov h16, v0.h[4]
+; NOB16B16-NEXT: bfcvt h3, s3
+; NOB16B16-NEXT: fmin s5, s5, s6
+; NOB16B16-NEXT: bfcvt h4, s4
+; NOB16B16-NEXT: mov h6, v0.h[5]
+; NOB16B16-NEXT: shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT: shll v16.4s, v16.4h, #16
+; NOB16B16-NEXT: mov v2.h[1], v3.h[0]
+; NOB16B16-NEXT: mov h3, v1.h[5]
+; NOB16B16-NEXT: bfcvt h5, s5
+; NOB16B16-NEXT: fmin s7, s16, s7
+; NOB16B16-NEXT: mov h16, v0.h[6]
+; NOB16B16-NEXT: shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT: mov h0, v0.h[7]
+; NOB16B16-NEXT: mov v2.h[2], v4.h[0]
+; NOB16B16-NEXT: mov h4, v1.h[6]
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: mov h1, v1.h[7]
+; NOB16B16-NEXT: shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT: fmin s3, s6, s3
+; NOB16B16-NEXT: shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT: mov v2.h[3], v5.h[0]
+; NOB16B16-NEXT: bfcvt h5, s7
+; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT: fmin s4, s6, s4
+; NOB16B16-NEXT: bfcvt h3, s3
+; NOB16B16-NEXT: mov v2.h[4], v5.h[0]
+; NOB16B16-NEXT: fmin s0, s0, s1
+; NOB16B16-NEXT: mov v2.h[5], v3.h[0]
+; NOB16B16-NEXT: bfcvt h3, s4
+; NOB16B16-NEXT: bfcvt h0, s0
+; NOB16B16-NEXT: mov v2.h[6], v3.h[0]
+; NOB16B16-NEXT: mov v2.h[7], v0.h[0]
+; NOB16B16-NEXT: mov v0.16b, v2.16b
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fmin_v8bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h, vl8
+; B16B16-NEXT: // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT: // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT: ret
+ %res = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+ ret <8 x bfloat> %res
+}
+
+;
+; FMINNM
+;
+
+define <4 x bfloat> @fminnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_v4bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT: // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT: mov h2, v1.h[1]
+; NOB16B16-NEXT: mov h3, v0.h[1]
+; NOB16B16-NEXT: mov h4, v1.h[2]
+; NOB16B16-NEXT: shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT: shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT: mov h7, v0.h[2]
+; NOB16B16-NEXT: mov h1, v1.h[3]
+; NOB16B16-NEXT: shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT: shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT: shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT: shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT: fminnm s2, s3, s2
+; NOB16B16-NEXT: fminnm s3, s6, s5
+; NOB16B16-NEXT: shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT: mov h6, v0.h[3]
+; NOB16B16-NEXT: fminnm s4, s5, s4
+; NOB16B16-NEXT: bfcvt h2, s2
+; NOB16B16-NEXT: bfcvt h0, s3
+; NOB16B16-NEXT: shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT: mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT: bfcvt h2, s4
+; NOB16B16-NEXT: fminnm s1, s3, s1
+; NOB16B16-NEXT: mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT: bfcvt h1, s1
+; NOB16B16-NEXT: mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT: ret
+;
+; B16B16-LABEL: fminnm_v4bf16:
+; B16B16: // %bb.0:
+; B16B16-NEXT: ptrue p0.h, vl4
+; B16B16-NEXT: // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT: // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT: // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT: ret
+ %res = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fminnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_v8bf16:
+; NOB16B16: // %bb.0:
+; NOB16B16-NEXT: mov h2, v1.h[1]
+; NOB16B16-NEXT: mov h3, v0.h[1]
+; NOB16B16-NEXT: shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT:...
[truncated]
|
paulwalker-arm
approved these changes
Nov 26, 2025
sdesmalen-arm
approved these changes
Nov 27, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
This can avoid the promotion bf16 -> f32 -> bf16 round trip (or costly expansions).