Skip to content

Conversation

@MacDue
Copy link
Member

@MacDue MacDue commented Nov 24, 2025

Lowering fixed-length bf16 vector arithmetic (v4bf16/v8bf16) to SVE when +sve-b16b16 is available avoids the bf16 -> f32 -> bf16 promotion round trip (or costly expansions).

This can avoid the promotion bf16 -> f32 -> bf16 round trip.
@llvmbot
Copy link
Member

llvmbot commented Nov 24, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

Lowering fixed-length bf16 vector arithmetic (v4bf16/v8bf16) to SVE when +sve-b16b16 is available avoids the bf16 -> f32 -> bf16 promotion round trip (or costly expansions).


Patch is 39.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169329.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+6-2)
  • (added) llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll (+936)
  • (modified) llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll (+18-85)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e91f5a877b35b..df024f571a4d1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1783,9 +1783,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
       setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
+    }
 
-      if (Subtarget->hasSVEB16B16() &&
-          Subtarget->isNonStreamingSVEorSME2Available()) {
+    if (Subtarget->hasSVEB16B16() &&
+        Subtarget->isNonStreamingSVEorSME2Available()) {
+      // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
+      for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
+                      MVT::nxv8bf16}) {
         setOperationAction(ISD::FADD, VT, Custom);
         setOperationAction(ISD::FMA, VT, Custom);
         setOperationAction(ISD::FMAXIMUM, VT, Custom);
diff --git a/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll b/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll
new file mode 100644
index 0000000000000..e6344b9eb89dc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fixed-length-bf16-arith.ll
@@ -0,0 +1,936 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve,+bf16             < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16
+; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; FABS
+;
+
+define <4 x bfloat> @fabs_v4bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: fabs_v4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.4h, #128, lsl #8
+; CHECK-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat> %a)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fabs_v8bf16(<8 x bfloat> %a) {
+; CHECK-LABEL: fabs_v8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bic v0.8h, #128, lsl #8
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %a)
+  ret <8 x bfloat> %res
+}
+
+;
+; FADD
+;
+
+define <4 x bfloat> @fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; NOB16B16-NEXT:    bfcvtn v0.4h, v0.4s
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fadd_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfadd z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = fadd <4 x bfloat> %a, %b
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fadd_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    shll v2.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v0.4h, #16
+; NOB16B16-NEXT:    shll2 v1.4s, v1.8h, #16
+; NOB16B16-NEXT:    shll2 v0.4s, v0.8h, #16
+; NOB16B16-NEXT:    fadd v2.4s, v3.4s, v2.4s
+; NOB16B16-NEXT:    fadd v1.4s, v0.4s, v1.4s
+; NOB16B16-NEXT:    bfcvtn v0.4h, v2.4s
+; NOB16B16-NEXT:    bfcvtn2 v0.8h, v1.4s
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fadd_v8bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl8
+; B16B16-NEXT:    // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT:    // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT:    bfadd z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT:    ret
+  %res = fadd <8 x bfloat> %a, %b
+  ret <8 x bfloat> %res
+}
+
+;
+; FDIV
+;
+
+define <4 x bfloat> @fdiv_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: fdiv_v4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    shll v0.4s, v0.4h, #16
+; CHECK-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bfcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+  %res = fdiv <4 x bfloat> %a, %b
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fdiv_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: fdiv_v8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shll v2.4s, v1.4h, #16
+; CHECK-NEXT:    shll v3.4s, v0.4h, #16
+; CHECK-NEXT:    shll2 v1.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v0.4s, v0.8h, #16
+; CHECK-NEXT:    fdiv v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    fdiv v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    bfcvtn v0.4h, v2.4s
+; CHECK-NEXT:    bfcvtn2 v0.8h, v1.4s
+; CHECK-NEXT:    ret
+  %res = fdiv <8 x bfloat> %a, %b
+  ret <8 x bfloat> %res
+}
+
+;
+; FMAX
+;
+
+define <4 x bfloat> @fmax_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    mov h4, v1.h[2]
+; NOB16B16-NEXT:    shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h1, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmax s2, s3, s2
+; NOB16B16-NEXT:    fmax s3, s6, s5
+; NOB16B16-NEXT:    shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT:    mov h6, v0.h[3]
+; NOB16B16-NEXT:    fmax s4, s5, s4
+; NOB16B16-NEXT:    bfcvt h2, s2
+; NOB16B16-NEXT:    bfcvt h0, s3
+; NOB16B16-NEXT:    shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmax s1, s3, s1
+; NOB16B16-NEXT:    mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h1, s1
+; NOB16B16-NEXT:    mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmax_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fmax_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fmax_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h6, v1.h[2]
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h16, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    fmax s4, s5, s4
+; NOB16B16-NEXT:    mov h5, v0.h[3]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    fmax s3, s3, s2
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmax s4, s7, s6
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v5.4h, #16
+; NOB16B16-NEXT:    mov h7, v1.h[4]
+; NOB16B16-NEXT:    mov h16, v0.h[4]
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    fmax s5, s5, s6
+; NOB16B16-NEXT:    bfcvt h4, s4
+; NOB16B16-NEXT:    mov h6, v0.h[5]
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    shll v16.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[1], v3.h[0]
+; NOB16B16-NEXT:    mov h3, v1.h[5]
+; NOB16B16-NEXT:    bfcvt h5, s5
+; NOB16B16-NEXT:    fmax s7, s16, s7
+; NOB16B16-NEXT:    mov h16, v0.h[6]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov h0, v0.h[7]
+; NOB16B16-NEXT:    mov v2.h[2], v4.h[0]
+; NOB16B16-NEXT:    mov h4, v1.h[6]
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    mov h1, v1.h[7]
+; NOB16B16-NEXT:    shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT:    fmax s3, s6, s3
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[3], v5.h[0]
+; NOB16B16-NEXT:    bfcvt h5, s7
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmax s4, s6, s4
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    mov v2.h[4], v5.h[0]
+; NOB16B16-NEXT:    fmax s0, s0, s1
+; NOB16B16-NEXT:    mov v2.h[5], v3.h[0]
+; NOB16B16-NEXT:    bfcvt h3, s4
+; NOB16B16-NEXT:    bfcvt h0, s0
+; NOB16B16-NEXT:    mov v2.h[6], v3.h[0]
+; NOB16B16-NEXT:    mov v2.h[7], v0.h[0]
+; NOB16B16-NEXT:    mov v0.16b, v2.16b
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmax_v8bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl8
+; B16B16-NEXT:    // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT:    // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT:    bfmax z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.maximum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %res
+}
+
+;
+; FMAXNM
+;
+
+define <4 x bfloat> @fmaxnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    mov h4, v1.h[2]
+; NOB16B16-NEXT:    shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h1, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmaxnm s2, s3, s2
+; NOB16B16-NEXT:    fmaxnm s3, s6, s5
+; NOB16B16-NEXT:    shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT:    mov h6, v0.h[3]
+; NOB16B16-NEXT:    fmaxnm s4, s5, s4
+; NOB16B16-NEXT:    bfcvt h2, s2
+; NOB16B16-NEXT:    bfcvt h0, s3
+; NOB16B16-NEXT:    shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmaxnm s1, s3, s1
+; NOB16B16-NEXT:    mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h1, s1
+; NOB16B16-NEXT:    mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmaxnm_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fmaxnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fmaxnm_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h6, v1.h[2]
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h16, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    fmaxnm s4, s5, s4
+; NOB16B16-NEXT:    mov h5, v0.h[3]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    fmaxnm s3, s3, s2
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmaxnm s4, s7, s6
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v5.4h, #16
+; NOB16B16-NEXT:    mov h7, v1.h[4]
+; NOB16B16-NEXT:    mov h16, v0.h[4]
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    fmaxnm s5, s5, s6
+; NOB16B16-NEXT:    bfcvt h4, s4
+; NOB16B16-NEXT:    mov h6, v0.h[5]
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    shll v16.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[1], v3.h[0]
+; NOB16B16-NEXT:    mov h3, v1.h[5]
+; NOB16B16-NEXT:    bfcvt h5, s5
+; NOB16B16-NEXT:    fmaxnm s7, s16, s7
+; NOB16B16-NEXT:    mov h16, v0.h[6]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov h0, v0.h[7]
+; NOB16B16-NEXT:    mov v2.h[2], v4.h[0]
+; NOB16B16-NEXT:    mov h4, v1.h[6]
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    mov h1, v1.h[7]
+; NOB16B16-NEXT:    shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT:    fmaxnm s3, s6, s3
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[3], v5.h[0]
+; NOB16B16-NEXT:    bfcvt h5, s7
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmaxnm s4, s6, s4
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    mov v2.h[4], v5.h[0]
+; NOB16B16-NEXT:    fmaxnm s0, s0, s1
+; NOB16B16-NEXT:    mov v2.h[5], v3.h[0]
+; NOB16B16-NEXT:    bfcvt h3, s4
+; NOB16B16-NEXT:    bfcvt h0, s0
+; NOB16B16-NEXT:    mov v2.h[6], v3.h[0]
+; NOB16B16-NEXT:    mov v2.h[7], v0.h[0]
+; NOB16B16-NEXT:    mov v0.16b, v2.16b
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmaxnm_v8bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl8
+; B16B16-NEXT:    // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT:    // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT:    bfmaxnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %res
+}
+
+;
+; FMIN
+;
+
+define <4 x bfloat> @fmin_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    mov h4, v1.h[2]
+; NOB16B16-NEXT:    shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h1, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmin s2, s3, s2
+; NOB16B16-NEXT:    fmin s3, s6, s5
+; NOB16B16-NEXT:    shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT:    mov h6, v0.h[3]
+; NOB16B16-NEXT:    fmin s4, s5, s4
+; NOB16B16-NEXT:    bfcvt h2, s2
+; NOB16B16-NEXT:    bfcvt h0, s3
+; NOB16B16-NEXT:    shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmin s1, s3, s1
+; NOB16B16-NEXT:    mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h1, s1
+; NOB16B16-NEXT:    mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmin_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.minimum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fmin_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fmin_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h6, v1.h[2]
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h16, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    fmin s4, s5, s4
+; NOB16B16-NEXT:    mov h5, v0.h[3]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    fmin s3, s3, s2
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fmin s4, s7, s6
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    shll v5.4s, v5.4h, #16
+; NOB16B16-NEXT:    mov h7, v1.h[4]
+; NOB16B16-NEXT:    mov h16, v0.h[4]
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    fmin s5, s5, s6
+; NOB16B16-NEXT:    bfcvt h4, s4
+; NOB16B16-NEXT:    mov h6, v0.h[5]
+; NOB16B16-NEXT:    shll v7.4s, v7.4h, #16
+; NOB16B16-NEXT:    shll v16.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[1], v3.h[0]
+; NOB16B16-NEXT:    mov h3, v1.h[5]
+; NOB16B16-NEXT:    bfcvt h5, s5
+; NOB16B16-NEXT:    fmin s7, s16, s7
+; NOB16B16-NEXT:    mov h16, v0.h[6]
+; NOB16B16-NEXT:    shll v6.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov h0, v0.h[7]
+; NOB16B16-NEXT:    mov v2.h[2], v4.h[0]
+; NOB16B16-NEXT:    mov h4, v1.h[6]
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    mov h1, v1.h[7]
+; NOB16B16-NEXT:    shll v0.4s, v0.4h, #16
+; NOB16B16-NEXT:    fmin s3, s6, s3
+; NOB16B16-NEXT:    shll v6.4s, v16.4h, #16
+; NOB16B16-NEXT:    mov v2.h[3], v5.h[0]
+; NOB16B16-NEXT:    bfcvt h5, s7
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fmin s4, s6, s4
+; NOB16B16-NEXT:    bfcvt h3, s3
+; NOB16B16-NEXT:    mov v2.h[4], v5.h[0]
+; NOB16B16-NEXT:    fmin s0, s0, s1
+; NOB16B16-NEXT:    mov v2.h[5], v3.h[0]
+; NOB16B16-NEXT:    bfcvt h3, s4
+; NOB16B16-NEXT:    bfcvt h0, s0
+; NOB16B16-NEXT:    mov v2.h[6], v3.h[0]
+; NOB16B16-NEXT:    mov v2.h[7], v0.h[0]
+; NOB16B16-NEXT:    mov v0.16b, v2.16b
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fmin_v8bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl8
+; B16B16-NEXT:    // kill: def $q0 killed $q0 def $z0
+; B16B16-NEXT:    // kill: def $q1 killed $q1 def $z1
+; B16B16-NEXT:    bfmin z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.minimum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %res
+}
+
+;
+; FMINNM
+;
+
+define <4 x bfloat> @fminnm_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_v4bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    // kill: def $d1 killed $d1 def $q1
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    mov h4, v1.h[2]
+; NOB16B16-NEXT:    shll v5.4s, v1.4h, #16
+; NOB16B16-NEXT:    shll v6.4s, v0.4h, #16
+; NOB16B16-NEXT:    mov h7, v0.h[2]
+; NOB16B16-NEXT:    mov h1, v1.h[3]
+; NOB16B16-NEXT:    shll v2.4s, v2.4h, #16
+; NOB16B16-NEXT:    shll v3.4s, v3.4h, #16
+; NOB16B16-NEXT:    shll v4.4s, v4.4h, #16
+; NOB16B16-NEXT:    shll v1.4s, v1.4h, #16
+; NOB16B16-NEXT:    fminnm s2, s3, s2
+; NOB16B16-NEXT:    fminnm s3, s6, s5
+; NOB16B16-NEXT:    shll v5.4s, v7.4h, #16
+; NOB16B16-NEXT:    mov h6, v0.h[3]
+; NOB16B16-NEXT:    fminnm s4, s5, s4
+; NOB16B16-NEXT:    bfcvt h2, s2
+; NOB16B16-NEXT:    bfcvt h0, s3
+; NOB16B16-NEXT:    shll v3.4s, v6.4h, #16
+; NOB16B16-NEXT:    mov v0.h[1], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h2, s4
+; NOB16B16-NEXT:    fminnm s1, s3, s1
+; NOB16B16-NEXT:    mov v0.h[2], v2.h[0]
+; NOB16B16-NEXT:    bfcvt h1, s1
+; NOB16B16-NEXT:    mov v0.h[3], v1.h[0]
+; NOB16B16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOB16B16-NEXT:    ret
+;
+; B16B16-LABEL: fminnm_v4bf16:
+; B16B16:       // %bb.0:
+; B16B16-NEXT:    ptrue p0.h, vl4
+; B16B16-NEXT:    // kill: def $d0 killed $d0 def $z0
+; B16B16-NEXT:    // kill: def $d1 killed $d1 def $z1
+; B16B16-NEXT:    bfminnm z0.h, p0/m, z0.h, z1.h
+; B16B16-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; B16B16-NEXT:    ret
+  %res = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %res
+}
+
+define <8 x bfloat> @fminnm_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; NOB16B16-LABEL: fminnm_v8bf16:
+; NOB16B16:       // %bb.0:
+; NOB16B16-NEXT:    mov h2, v1.h[1]
+; NOB16B16-NEXT:    mov h3, v0.h[1]
+; NOB16B16-NEXT:    shll v4.4s, v1.4h, #16
+; NOB16B16-NEXT:...
[truncated]

@MacDue MacDue merged commit ee45ba2 into llvm:main Nov 27, 2025
12 checks passed
@MacDue MacDue deleted the use_sve_bfloat branch November 27, 2025 19:18
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants