[AArch64] Use SVE for fixed-length bf16 operations with +sve-b16b16 #169329

MacDue · 2025-11-24T14:06:21Z

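When +sve-b16b16 is available, fixed-length bf16 vector operations (v4bf16/v8bf16, e.g. fadd, fmaximum, fmaxnum) are lowered to SVE bf16 instructions. This avoids the promotion bf16 -> f32 -> bf16 round trip (or, for operations such as fmax/fmin, a costly per-lane expansion).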

This can avoid the promotion bf16 -> f32 -> bf16 round trip.

llvmbot · 2025-11-24T14:06:59Z

@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

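When +sve-b16b16 is available, fixed-length bf16 vector operations (v4bf16/v8bf16, e.g. fadd, fmaximum, fmaxnum) are lowered to SVE bf16 instructions. This avoids the promotion bf16 -> f32 -> bf16 round trip (or, for operations such as fmax/fmin, a costly per-lane expansion).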

Patch is 39.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169329.diff

3 Files Affected:

(modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+6-2)
(added) llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll (+936)
(modified) llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll (+18-85)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e91f5a877b35b..df024f571a4d1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1783,9 +1783,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
       setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
+    }
 
-      if (Subtarget->hasSVEB16B16() &&
-          Subtarget->isNonStreamingSVEorSME2Available()) {
+    if (Subtarget->hasSVEB16B16() &&
+        Subtarget->isNonStreamingSVEorSME2Available()) {
+      // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
+      for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
+                      MVT::nxv8bf16}) {
         setOperationAction(ISD::FADD, VT, Custom);
         setOperationAction(ISD::FMA, VT, Custom);
         setOperationAction(ISD::FMAXIMUM, VT, Custom);
diff --git a/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll b/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll
new file mode 100644
index 0000000000000..e6344b9eb89dc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll
@@ -0,0 +1,936 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve,+bf16             < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
+; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; FABS
+;
+
+define <4 x bfloat> @fabs_v4bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: fabs_v4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.4h, #128, lsl #8
+; CHECK-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> %a)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fabs_v8bf16(<8 x bfloat> %a) {
+; CHECK-LABEL: fabs_v8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.8h, #128, lsl #8
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %a)
+  ret <8 x bfloat> %res
+}
+
+;
+; FADD
+;
+
+define <4 x bfloat> @fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NOB16B16-NEXT:    bfcvtn v0.4h, v0.4s
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fadd_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfadd z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = fadd <4 x bfloat> %a, %b
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    shll v2.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v0.4h, #16
+; NOB16B16-NEXT:    shll2 v1.4s, v1.8h, #16
+; NOB16B16-NEXT:    shll2 v0.4s, v0.8h, #16
+; NOB16B16-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; NOB16B16-NEXT:    fadd v1.4s, v0.4s, v1.4s
+; NOB16B16-NEXT:    bfcvtn v0.4h, v2.4s
+; NOB16B16-NEXT:    bfcvtn2 v0.8h, v1.4s
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fadd_v8bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl8
+; B16B16-NEXT:    // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT:    // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT:    bfadd z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT:    ret
+  %res = fadd <8 x bfloat> %a, %b
+  ret <8 x bfloat> %res
+}
+
+;
+; FDIV
+;
+
+define <4 x bfloat> @fdiv_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: fdiv_v4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %res = fdiv <4 x bfloat> %a, %b
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fdiv_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: fdiv_v8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-NEXT:    shll2 v1.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v0.4s, v0.8h, #16
+; CHECK-NEXT:    fdiv v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fdiv v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bfcvtn v0.4h, v2.4s
+; CHECK-NEXT:    bfcvtn2 v0.8h, v1.4s
+; CHECK-NEXT:    ret
+  %res = fdiv <8 x bfloat> %a, %b
+  ret <8 x bfloat> %res
+}
+
+;
+; FMAX
+;
+
+define <4 x bfloat> @fmax_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    mov h4, v1.h[2]
+; NOB16B16-NEXT:    shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h1, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmax s2, s3, s2
+; NOB16B16-NEXT:    fmax s3, s6, s5
+; NOB16B16-NEXT:    shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT:    mov h6, v0.h[3]
+; NOB16B16-NEXT:    fmax s4, s5, s4
+; NOB16B16-NEXT:    bfcvt h2, s2
+; NOB16B16-NEXT:    bfcvt h0, s3
+; NOB16B16-NEXT:    shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmax s1, s3, s1
+; NOB16B16-NEXT:    mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h1, s1
+; NOB16B16-NEXT:    mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmax_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fmax_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h6, v1.h[2]
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h16, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    fmax s4, s5, s4
+; NOB16B16-NEXT:    mov h5, v0.h[3]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    fmax s3, s3, s2
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmax s4, s7, s6
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v5.4h, #16
+; NOB16B16-NEXT:    mov h7, v1.h[4]
+; NOB16B16-NEXT:    mov h16, v0.h[4]
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    fmax s5, s5, s6
+; NOB16B16-NEXT:    bfcvt h4, s4
+; NOB16B16-NEXT:    mov h6, v0.h[5]
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    shll v16.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[1], v3.h[0]
+; NOB16B16-NEXT:    mov h3, v1.h[5]
+; NOB16B16-NEXT:    bfcvt h5, s5
+; NOB16B16-NEXT:    fmax s7, s16, s7
+; NOB16B16-NEXT:    mov h16, v0.h[6]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov h0, v0.h[7]
+; NOB16B16-NEXT:    mov v2.h[2], v4.h[0]
+; NOB16B16-NEXT:    mov h4, v1.h[6]
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    mov h1, v1.h[7]
+; NOB16B16-NEXT:    shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT:    fmax s3, s6, s3
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[3], v5.h[0]
+; NOB16B16-NEXT:    bfcvt h5, s7
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmax s4, s6, s4
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    mov v2.h[4], v5.h[0]
+; NOB16B16-NEXT:    fmax s0, s0, s1
+; NOB16B16-NEXT:    mov v2.h[5], v3.h[0]
+; NOB16B16-NEXT:    bfcvt h3, s4
+; NOB16B16-NEXT:    bfcvt h0, s0
+; NOB16B16-NEXT:    mov v2.h[6], v3.h[0]
+; NOB16B16-NEXT:    mov v2.h[7], v0.h[0]
+; NOB16B16-NEXT:    mov v0.16b, v2.16b
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmax_v8bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl8
+; B16B16-NEXT:    // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT:    // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %res
+}
+
+;
+; FMAXNM
+;
+
+define <4 x bfloat> @fmaxnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    mov h4, v1.h[2]
+; NOB16B16-NEXT:    shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h1, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmaxnm s2, s3, s2
+; NOB16B16-NEXT:    fmaxnm s3, s6, s5
+; NOB16B16-NEXT:    shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT:    mov h6, v0.h[3]
+; NOB16B16-NEXT:    fmaxnm s4, s5, s4
+; NOB16B16-NEXT:    bfcvt h2, s2
+; NOB16B16-NEXT:    bfcvt h0, s3
+; NOB16B16-NEXT:    shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmaxnm s1, s3, s1
+; NOB16B16-NEXT:    mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h1, s1
+; NOB16B16-NEXT:    mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmaxnm_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fmaxnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h6, v1.h[2]
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h16, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    fmaxnm s4, s5, s4
+; NOB16B16-NEXT:    mov h5, v0.h[3]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    fmaxnm s3, s3, s2
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmaxnm s4, s7, s6
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v5.4h, #16
+; NOB16B16-NEXT:    mov h7, v1.h[4]
+; NOB16B16-NEXT:    mov h16, v0.h[4]
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    fmaxnm s5, s5, s6
+; NOB16B16-NEXT:    bfcvt h4, s4
+; NOB16B16-NEXT:    mov h6, v0.h[5]
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    shll v16.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[1], v3.h[0]
+; NOB16B16-NEXT:    mov h3, v1.h[5]
+; NOB16B16-NEXT:    bfcvt h5, s5
+; NOB16B16-NEXT:    fmaxnm s7, s16, s7
+; NOB16B16-NEXT:    mov h16, v0.h[6]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov h0, v0.h[7]
+; NOB16B16-NEXT:    mov v2.h[2], v4.h[0]
+; NOB16B16-NEXT:    mov h4, v1.h[6]
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    mov h1, v1.h[7]
+; NOB16B16-NEXT:    shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT:    fmaxnm s3, s6, s3
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[3], v5.h[0]
+; NOB16B16-NEXT:    bfcvt h5, s7
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmaxnm s4, s6, s4
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    mov v2.h[4], v5.h[0]
+; NOB16B16-NEXT:    fmaxnm s0, s0, s1
+; NOB16B16-NEXT:    mov v2.h[5], v3.h[0]
+; NOB16B16-NEXT:    bfcvt h3, s4
+; NOB16B16-NEXT:    bfcvt h0, s0
+; NOB16B16-NEXT:    mov v2.h[6], v3.h[0]
+; NOB16B16-NEXT:    mov v2.h[7], v0.h[0]
+; NOB16B16-NEXT:    mov v0.16b, v2.16b
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmaxnm_v8bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl8
+; B16B16-NEXT:    // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT:    // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %res
+}
+
+;
+; FMIN
+;
+
+define <4 x bfloat> @fmin_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    mov h4, v1.h[2]
+; NOB16B16-NEXT:    shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h1, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmin s2, s3, s2
+; NOB16B16-NEXT:    fmin s3, s6, s5
+; NOB16B16-NEXT:    shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT:    mov h6, v0.h[3]
+; NOB16B16-NEXT:    fmin s4, s5, s4
+; NOB16B16-NEXT:    bfcvt h2, s2
+; NOB16B16-NEXT:    bfcvt h0, s3
+; NOB16B16-NEXT:    shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmin s1, s3, s1
+; NOB16B16-NEXT:    mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h1, s1
+; NOB16B16-NEXT:    mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmin_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fmin_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h6, v1.h[2]
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h16, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    fmin s4, s5, s4
+; NOB16B16-NEXT:    mov h5, v0.h[3]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    fmin s3, s3, s2
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmin s4, s7, s6
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v5.4h, #16
+; NOB16B16-NEXT:    mov h7, v1.h[4]
+; NOB16B16-NEXT:    mov h16, v0.h[4]
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    fmin s5, s5, s6
+; NOB16B16-NEXT:    bfcvt h4, s4
+; NOB16B16-NEXT:    mov h6, v0.h[5]
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    shll v16.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[1], v3.h[0]
+; NOB16B16-NEXT:    mov h3, v1.h[5]
+; NOB16B16-NEXT:    bfcvt h5, s5
+; NOB16B16-NEXT:    fmin s7, s16, s7
+; NOB16B16-NEXT:    mov h16, v0.h[6]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov h0, v0.h[7]
+; NOB16B16-NEXT:    mov v2.h[2], v4.h[0]
+; NOB16B16-NEXT:    mov h4, v1.h[6]
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    mov h1, v1.h[7]
+; NOB16B16-NEXT:    shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT:    fmin s3, s6, s3
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[3], v5.h[0]
+; NOB16B16-NEXT:    bfcvt h5, s7
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmin s4, s6, s4
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    mov v2.h[4], v5.h[0]
+; NOB16B16-NEXT:    fmin s0, s0, s1
+; NOB16B16-NEXT:    mov v2.h[5], v3.h[0]
+; NOB16B16-NEXT:    bfcvt h3, s4
+; NOB16B16-NEXT:    bfcvt h0, s0
+; NOB16B16-NEXT:    mov v2.h[6], v3.h[0]
+; NOB16B16-NEXT:    mov v2.h[7], v0.h[0]
+; NOB16B16-NEXT:    mov v0.16b, v2.16b
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmin_v8bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl8
+; B16B16-NEXT:    // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT:    // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %res
+}
+
+;
+; FMINNM
+;
+
+define <4 x bfloat> @fminnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    mov h4, v1.h[2]
+; NOB16B16-NEXT:    shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h1, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fminnm s2, s3, s2
+; NOB16B16-NEXT:    fminnm s3, s6, s5
+; NOB16B16-NEXT:    shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT:    mov h6, v0.h[3]
+; NOB16B16-NEXT:    fminnm s4, s5, s4
+; NOB16B16-NEXT:    bfcvt h2, s2
+; NOB16B16-NEXT:    bfcvt h0, s3
+; NOB16B16-NEXT:    shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fminnm s1, s3, s1
+; NOB16B16-NEXT:    mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h1, s1
+; NOB16B16-NEXT:    mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fminnm_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfminnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fminnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT:...
[truncated]

[AArch64] Use SVE for fixed-length bf16 operations with +sve-b16b16

32fe114

This can avoid the promotion bf16 -> f32 -> bf16 round trip.

MacDue requested review from SamTebbs33, huntergr-arm and paulwalker-arm November 24, 2025 14:06

llvmbot added the backend:AArch64 label Nov 24, 2025

paulwalker-arm approved these changes Nov 26, 2025

View reviewed changes

sdesmalen-arm approved these changes Nov 27, 2025

View reviewed changes

MacDue merged commit ee45ba2 into llvm:main Nov 27, 2025
12 checks passed

MacDue deleted the use_sve_bfloat branch November 27, 2025 19:18

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AArch64] Use SVE for fixed-length bf16 operations with +sve-b16b16 #169329

[AArch64] Use SVE for fixed-length bf16 operations with +sve-b16b16 #169329

MacDue commented Nov 24, 2025

Uh oh!

llvmbot commented Nov 24, 2025

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

[AArch64] Use SVE for fixed-length bf16 operations with +sve-b16b16 #169329

[AArch64] Use SVE for fixed-length bf16 operations with +sve-b16b16 #169329

Conversation

MacDue commented Nov 24, 2025

Uh oh!

llvmbot commented Nov 24, 2025

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants