From 9004ff2f276a1c84043577a97721049343ae8bed Mon Sep 17 00:00:00 2001 From: Amichaxx Date: Mon, 27 Oct 2025 16:49:03 +0000 Subject: [PATCH 1/2] Implement widening FMMLA intrinsics - F16 to F32 - MF8 to F32 - MF8 to F16 --- clang/include/clang/Basic/arm_sve.td | 12 ++++++ .../sve-intrinsics/acle_sve_fmmla-f32f16.c | 33 +++++++++++++++ .../sve2-intrinsics/acle_sve2_fmmla-f16mf8.c | 35 ++++++++++++++++ .../sve2-intrinsics/acle_sve2_fmmla-f32mf8.c | 36 ++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 14 +++++++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 6 ++- llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll | 32 +++++++++++++++ .../test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll | 39 ++++++++++++++++++ .../test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll | 41 +++++++++++++++++++ 9 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c create mode 100644 clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c create mode 100644 clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c create mode 100644 llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll create mode 100644 llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll create mode 100644 llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index d2b7b78b9970f..c63da3308d6a0 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1196,6 +1196,18 @@ def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in { def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla">; +let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in { + def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "MMdd", "h", MergeNone, "aarch64_sve_fmmla_f16f32", [IsOverloadNone]>; +} + +let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in { + def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "MM~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f32", [IsOverloadNone]>; +} + +let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in { + def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "OO~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f16", [IsOverloadNone]>; +} + def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">; def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2q">; def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp1q">; diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c new file mode 100644 index 0000000000000..bebaa059e5c84 --- /dev/null +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c @@ -0,0 +1,33 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 
-fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_f32f16( +// CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.f16f32( [[ACC]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_( +// CPP-CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.f16f32( [[ACC]], [[A]], [[B]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) { + return SVE_ACLE_FUNC(svmmla, _f32_f16, , )(acc, a, b); +} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c new file mode 100644 index 0000000000000..a19ad0576bb4b --- /dev/null +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c @@ -0,0 +1,35 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_f16mf8( +// CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f16( [[ACC]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m( +// CPP-CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f16( [[ACC]], [[A]], [[B]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) { + return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm, )(acc, a, b, fpmr); +} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c new file mode 100644 index 0000000000000..526f2b1f45927 --- /dev/null +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c @@ -0,0 +1,36 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 + +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_f32mf8( +// CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f32( [[ACC]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CPP-CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f32( [[ACC]], [[A]], [[B]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) { + return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm, )(acc, a, b, fpmr); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index b81edc385cd43..832f97fc95959 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2807,6 +2807,20 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic; // def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic; +def int_aarch64_sve_fmmla_f16f32 + : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], + [ llvm_nxv4f32_ty, llvm_nxv8f16_ty, llvm_nxv8f16_ty ], + [IntrNoMem]>; + +def int_aarch64_sve_fmmla_mf8f32 + : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], + [ llvm_nxv4f32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ], + [IntrNoMem]>; + +def int_aarch64_sve_fmmla_mf8f16 + : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty], + [ llvm_nxv8f16_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ], + [IntrNoMem]>; // // SVE ACLE: 7.2. 
BFloat16 extensions // diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3b268dcbca600..c756873d0bf7e 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3684,7 +3684,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in { } // End HasSVE, HasMatMulFP32 let Predicates = [HasSVE_F16F32MM] in { - def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>; + defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla_f16f32, nxv4f32, nxv8f16>; } // End HasSVE_F16F32MM let Predicates = [HasSVE, HasMatMulFP64] in { @@ -4745,10 +4745,14 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_ let Predicates = [HasSVE2, HasF8F32MM] in { def FMMLA_ZZZ_BtoS : sve2_fp8_mmla<0b0, ZPR32, "fmmla">; + def : Pat<(nxv4f32 (int_aarch64_sve_fmmla_mf8f32 nxv4f32:$acc, nxv16i8:$zn, nxv16i8:$zm)), + (FMMLA_ZZZ_BtoS $acc, $zn, $zm)>; } let Predicates = [HasSVE2, HasF8F16MM] in { def FMMLA_ZZZ_BtoH : sve2_fp8_mmla<0b1, ZPR16, "fmmla">; + def : Pat<(nxv8f16 (int_aarch64_sve_fmmla_mf8f16 nxv8f16:$acc, nxv16i8:$zn, nxv16i8:$zm)), + (FMMLA_ZZZ_BtoH $acc, $zn, $zm)>; } let Predicates = [HasSSVE_FP8DOT2] in { diff --git a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll new file mode 100644 index 0000000000000..ea636d65a479c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s --check-prefixes=CHECK + +define @_Z1tu13__SVFloat32_tu13__SVFloat16_tS0_( %acc, %a, %b) { +; CHECK-LABEL: _Z1tu13__SVFloat32_tu13__SVFloat16_tS0_: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: str z0, [sp, #2, mul vl] +; CHECK-NEXT: fmmla z0.s, z1.h, z2.h +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %acc.addr = alloca , align 16 + %a.addr = alloca , align 16 + %b.addr = alloca , align 16 + store %acc, ptr %acc.addr, align 16 + store %a, ptr %a.addr, align 16 + store %b, ptr %b.addr, align 16 + %0 = load , ptr %acc.addr, align 16 + %1 = load , ptr %a.addr, align 16 + %2 = load , ptr %b.addr, align 16 + %3 = call @llvm.aarch64.sve.fmmla.f16f32( %0, %1, %2) + ret %3 +} + +declare @llvm.aarch64.sve.fmmla.f16f32(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll new file mode 100644 index 0000000000000..0fdd6bf2508e3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s --check-prefixes=CHECK + +define @_Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m( %acc, %a, %b, i64 %fpmr) { +; CHECK-LABEL: _Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x8, sp, #3 +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: str x0, [x8, #8] +; CHECK-NEXT: msr FPMR, x0 +; CHECK-NEXT: fmmla z0.h, z1.b, z2.b +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %acc.addr = alloca , align 16 + %a.addr = alloca , align 16 + %b.addr = alloca , align 16 + %fpmr.addr = alloca i64, align 8 + store %acc, ptr %acc.addr, align 16 + store %a, ptr %a.addr, align 16 + store %b, ptr %b.addr, align 16 + store i64 %fpmr, ptr %fpmr.addr, align 8 + %0 = load , ptr %acc.addr, align 16 + %1 = load , ptr %a.addr, align 16 + %2 = load , ptr %b.addr, align 16 + %3 = load i64, ptr %fpmr.addr, align 8 + call void @llvm.aarch64.set.fpmr(i64 %3) + %4 = call @llvm.aarch64.sve.fmmla.mf8f16( %0, %1, %2) + ret %4 +} + +declare @llvm.aarch64.sve.fmmla.mf8f16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll new file mode 100644 index 0000000000000..007a164ac77da --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s --check-prefixes=CHECK + +define dso_local @_Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m( %acc, %a, %b, i64 noundef %fpmr) #0 { +; CHECK-LABEL: _Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x8, sp, #3 +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: str x0, [x8, #8] +; CHECK-NEXT: msr FPMR, x0 +; CHECK-NEXT: fmmla z0.s, z1.b, z2.b +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %acc.addr = alloca , align 16 + %a.addr = alloca , align 16 + %b.addr = alloca , align 16 + %fpmr.addr = alloca i64, align 8 + store %acc, ptr %acc.addr, align 16 + store %a, ptr %a.addr, align 16 + store %b, ptr %b.addr, align 16 + store i64 %fpmr, ptr %fpmr.addr, align 8 + %0 = load , ptr %acc.addr, align 16 + %1 = load , ptr %a.addr, align 16 + %2 = load , ptr %b.addr, align 16 + %3 = load i64, ptr %fpmr.addr, align 8 + call void @llvm.aarch64.set.fpmr(i64 %3) + %4 = call @llvm.aarch64.sve.fmmla.mf8f32( %0, %1, %2) + ret %4 +} + +declare void @llvm.aarch64.set.fpmr(i64) + +declare @llvm.aarch64.sve.fmmla.mf8f32(, , ) From 4b3703398a6d3ebac4eb1a9e6b46022f61f06da1 Mon Sep 17 00:00:00 2001 From: Amichaxx Date: Wed, 12 Nov 2025 14:38:05 +0000 Subject: [PATCH 2/2] - Implemented overloading for fmmla intrinsics, replaced fixed-type intrinsics - Prototype cleanups - Updated ll tests to remove unnecessary IR - Removed unused arguments in clang test macros - Removed redundant check lines in ll tests --- clang/include/clang/Basic/arm_sve.td | 10 ++--- .../sve-intrinsics/acle_sve_fmmla-f32f16.c | 12 +++--- .../sve-intrinsics/acle_sve_matmul_fp32.c | 4 +- .../sve-intrinsics/acle_sve_matmul_fp64.c | 4 +- .../sve2-intrinsics/acle_sve2_fmmla-f16mf8.c | 10 ++--- .../sve2-intrinsics/acle_sve2_fmmla-f32mf8.c | 10 ++--- llvm/include/llvm/IR/IntrinsicsAArch64.td | 18 ++------- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 10 ++--- llvm/lib/Target/AArch64/SVEInstrFormats.td | 6 +++ llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll | 30 +++----------- .../test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll | 37 +++--------------- .../test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll | 39 +++---------------- 12 files changed, 53 insertions(+), 137 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index c63da3308d6a0..3901c88323ff4 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1190,22 +1190,22 @@ def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarc } let SVETargetGuard = "f32mm", SMETargetGuard = InvalidMode in { -def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla">; +def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>; } let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in { -def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla">; +def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>; let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in { - def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "MMdd", "h", MergeNone, "aarch64_sve_fmmla_f16f32", [IsOverloadNone]>; + def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "ddhh", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>; } let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in { - def SVMLLA_F32_MF8 : 
SInst<"svmmla[_f32_mf8]", "MM~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f32", [IsOverloadNone]>; + def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>; } let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in { - def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "OO~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f16", [IsOverloadNone]>; + def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>; } def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">; diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c index bebaa059e5c84..ef74024f7b091 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c @@ -10,24 +10,24 @@ #include #ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#define SVE_ACLE_FUNC(A1, A3) A1##A3 #else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#define SVE_ACLE_FUNC(A1, A2) A1##A2 #endif + // CHECK-LABEL: define dso_local @test_f32f16( // CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.f16f32( [[ACC]], [[A]], [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16( [[ACC]], [[A]], [[B]]) // CHECK-NEXT: ret [[TMP0]] // // CPP-CHECK-LABEL: define dso_local @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_( // CPP-CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CPP-CHECK-NEXT: [[ENTRY:.*:]] -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.f16f32( [[ACC]], [[A]], [[B]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16( [[ACC]], [[A]], [[B]]) // CPP-CHECK-NEXT: ret [[TMP0]] // svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) { - return SVE_ACLE_FUNC(svmmla, _f32_f16, , )(acc, a, b); + return SVE_ACLE_FUNC(svmmla, _f32_f16)(acc, a, b); } diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c index 10442f4e31153..7d1efb7b6d954 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c @@ -17,12 +17,12 @@ // CHECK-LABEL: @test_svmmla_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv4f32( [[X:%.*]], [[Y:%.*]], [[Z:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32( [[X:%.*]], [[Y:%.*]], [[Z:%.*]]) // CHECK-NEXT: ret [[TMP0]] // // CPP-CHECK-LABEL: @_Z15test_svmmla_f32u13__SVFloat32_tS_S_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv4f32( [[X:%.*]], [[Y:%.*]], [[Z:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32( [[X:%.*]], [[Y:%.*]], [[Z:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // svfloat32_t test_svmmla_f32(svfloat32_t x, svfloat32_t y, svfloat32_t z) { diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c index 
8586a65fa240f..da211c4fba324 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c @@ -17,12 +17,12 @@ // CHECK-LABEL: @test_svmmla_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv2f64( [[X:%.*]], [[Y:%.*]], [[Z:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64( [[X:%.*]], [[Y:%.*]], [[Z:%.*]]) // CHECK-NEXT: ret [[TMP0]] // // CPP-CHECK-LABEL: @_Z15test_svmmla_f64u13__SVFloat64_tS_S_( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv2f64( [[X:%.*]], [[Y:%.*]], [[Z:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64( [[X:%.*]], [[Y:%.*]], [[Z:%.*]]) // CPP-CHECK-NEXT: ret [[TMP0]] // svfloat64_t test_svmmla_f64(svfloat64_t x, svfloat64_t y, svfloat64_t z) { diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c index a19ad0576bb4b..81f5968cd5d66 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c @@ -11,25 +11,25 @@ #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3 #else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3 #endif // CHECK-LABEL: define dso_local @test_f16mf8( // CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f16( [[ACC]], [[A]], [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv8f16.nxv16i8( [[ACC]], [[A]], [[B]]) // CHECK-NEXT: ret [[TMP0]] // // CPP-CHECK-LABEL: define dso_local @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m( // CPP-CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { // CPP-CHECK-NEXT: [[ENTRY:.*:]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f16( [[ACC]], [[A]], [[B]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv8f16.nxv16i8( [[ACC]], [[A]], [[B]]) // CPP-CHECK-NEXT: ret [[TMP0]] // svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) { - return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm, )(acc, a, b, fpmr); + return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm)(acc, a, b, fpmr); } diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c index 526f2b1f45927..8af71a6a0500f 100644 --- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c @@ -12,25 +12,25 @@ #ifdef SVE_OVERLOADED_FORMS // A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3 #else -#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3 #endif // CHECK-LABEL: define dso_local @test_f32mf8( // CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f32( [[ACC]], [[A]], [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8( [[ACC]], [[A]], [[B]]) // CHECK-NEXT: ret [[TMP0]] // // CPP-CHECK-LABEL: define dso_local @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( // CPP-CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { // CPP-CHECK-NEXT: [[ENTRY:.*:]] // CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f32( [[ACC]], [[A]], [[B]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8( [[ACC]], [[A]], [[B]]) // CPP-CHECK-NEXT: ret [[TMP0]] // svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) { - return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm, )(acc, a, b, fpmr); + return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm)(acc, a, b, fpmr); } diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 832f97fc95959..c1c202c9bd64e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2805,22 +2805,12 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic; // // SVE ACLE: 7.4/5. FP64/FP32 matrix multiply extensions // -def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic; -def int_aarch64_sve_fmmla_f16f32 - : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], - [ llvm_nxv4f32_ty, llvm_nxv8f16_ty, llvm_nxv8f16_ty ], - [IntrNoMem]>; - -def int_aarch64_sve_fmmla_mf8f32 - : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], - [ llvm_nxv4f32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ], - [IntrNoMem]>; +def int_aarch64_sve_fmmla + : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1> ], + [ IntrNoMem ]>; -def int_aarch64_sve_fmmla_mf8f16 - : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty], - [ llvm_nxv8f16_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ], - [IntrNoMem]>; // // SVE ACLE: 7.2. 
BFloat16 extensions // diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index c756873d0bf7e..37823f8795f6b 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3684,7 +3684,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in { } // End HasSVE, HasMatMulFP32 let Predicates = [HasSVE_F16F32MM] in { - defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla_f16f32, nxv4f32, nxv8f16>; + defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla, nxv4f32, nxv8f16>; } // End HasSVE_F16F32MM let Predicates = [HasSVE, HasMatMulFP64] in { @@ -4744,15 +4744,11 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_ } // End HasSSVE_FP8FMA let Predicates = [HasSVE2, HasF8F32MM] in { - def FMMLA_ZZZ_BtoS : sve2_fp8_mmla<0b0, ZPR32, "fmmla">; - def : Pat<(nxv4f32 (int_aarch64_sve_fmmla_mf8f32 nxv4f32:$acc, nxv16i8:$zn, nxv16i8:$zm)), - (FMMLA_ZZZ_BtoS $acc, $zn, $zm)>; + defm FMMLA_ZZZ_BtoS : sve2_fp8_fmmla<0b0, ZPR32, "fmmla", nxv4f32>; } let Predicates = [HasSVE2, HasF8F16MM] in { - def FMMLA_ZZZ_BtoH : sve2_fp8_mmla<0b1, ZPR16, "fmmla">; - def : Pat<(nxv8f16 (int_aarch64_sve_fmmla_mf8f16 nxv8f16:$acc, nxv16i8:$zn, nxv16i8:$zm)), - (FMMLA_ZZZ_BtoH $acc, $zn, $zm)>; + defm FMMLA_ZZZ_BtoH : sve2_fp8_fmmla<0b1, ZPR16, "fmmla", nxv8f16>; } let Predicates = [HasSSVE_FP8DOT2] in { diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 1664f4ad0c8fa..0694b623bfa67 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -11143,6 +11143,12 @@ class sve2_fp8_mmla let Uses = [FPMR, FPCR]; } +multiclass sve2_fp8_fmmla opc, ZPRRegOp zprty, string mnemonic, ValueType ResVT> { + def NAME : sve2_fp8_mmla; + def : Pat<(ResVT (int_aarch64_sve_fmmla ResVT:$acc, nxv16i8:$zn, nxv16i8:$zm)), + (!cast(NAME) $acc, $zn, $zm)>; +} + class sve_fp8_dot_indexed opc, ZPRRegOp dst_ty, Operand iop_ty, string mnemonic> : I<(outs dst_ty:$Zda), (ins dst_ty:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, iop_ty:$iop), mnemonic, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> { diff --git a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll index ea636d65a479c..aa856a557d1ec 100644 --- a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll +++ b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll @@ -1,32 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s -define @_Z1tu13__SVFloat32_tu13__SVFloat16_tS0_( %acc, %a, %b) { -; CHECK-LABEL: _Z1tu13__SVFloat32_tu13__SVFloat16_tS0_: +define @fmmla_f32f16( %acc, %a, %b) #0 { +; CHECK-LABEL: fmmla_f32f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: str z0, [sp, #2, mul vl] ; CHECK-NEXT: fmmla z0.s, z1.h, z2.h -; CHECK-NEXT: str z1, [sp, #1, mul vl] -; CHECK-NEXT: str z2, [sp] -; CHECK-NEXT: addvl sp, sp, #3 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: - %acc.addr = alloca , align 16 - %a.addr = alloca , align 16 - %b.addr = alloca , align 16 - store %acc, ptr %acc.addr, align 16 - store %a, ptr %a.addr, align 16 - store %b, ptr %b.addr, align 16 - %0 = load , ptr %acc.addr, align 16 - %1 = load , ptr %a.addr, align 16 - %2 = load , ptr %b.addr, align 16 - %3 = call @llvm.aarch64.sve.fmmla.f16f32( %0, %1, %2) - ret %3 + %out = call @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16( %acc, %a, %b) + ret %out } - -declare @llvm.aarch64.sve.fmmla.f16f32(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll index 0fdd6bf2508e3..99e23e7ab9fd5 100644 --- a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll +++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll @@ -1,39 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s -define @_Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m( %acc, %a, %b, i64 %fpmr) { -; CHECK-LABEL: _Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m: +define @fmmla_f16mf8( %acc, %a, %b, i64 %fpmr) { +; CHECK-LABEL: fmmla_f16mf8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: addvl x8, sp, #3 -; CHECK-NEXT: str z1, [sp, #1, mul vl] -; CHECK-NEXT: str z0, [sp, #2, mul vl] -; CHECK-NEXT: str z2, [sp] -; CHECK-NEXT: str x0, [x8, #8] -; CHECK-NEXT: msr FPMR, x0 ; CHECK-NEXT: fmmla z0.h, z1.b, z2.b -; CHECK-NEXT: addvl sp, sp, #3 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: - %acc.addr = alloca , align 16 - %a.addr = alloca , align 16 - %b.addr = alloca , align 16 - %fpmr.addr = alloca i64, align 8 - store %acc, ptr %acc.addr, align 16 - store %a, ptr %a.addr, align 16 - store %b, ptr %b.addr, align 16 - store i64 %fpmr, ptr %fpmr.addr, align 8 - %0 = load , ptr %acc.addr, align 16 - %1 = load , ptr %a.addr, align 16 - %2 = load , ptr %b.addr, align 16 - %3 = load i64, ptr %fpmr.addr, align 8 - call void @llvm.aarch64.set.fpmr(i64 %3) - %4 = call @llvm.aarch64.sve.fmmla.mf8f16( %0, %1, %2) - ret %4 + %out = call @llvm.aarch64.sve.fmmla.mf8f16( %acc, %a, %b) + ret %out } - -declare @llvm.aarch64.sve.fmmla.mf8f16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll index 007a164ac77da..503baf484ecd6 100644 --- a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll +++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll @@ -1,41 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s --check-prefixes=CHECK +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s -define dso_local @_Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m( %acc, %a, %b, i64 noundef %fpmr) #0 { -; CHECK-LABEL: _Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m: +define dso_local @fmmla_f32mf8( %acc, %a, %b, i64 noundef %fpmr) #0 { +; CHECK-LABEL: fmmla_f32mf8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: addvl x8, sp, #3 -; CHECK-NEXT: str z1, [sp, #1, mul vl] -; CHECK-NEXT: str z0, [sp, #2, mul vl] -; CHECK-NEXT: str z2, [sp] -; CHECK-NEXT: str x0, [x8, #8] -; CHECK-NEXT: msr FPMR, x0 ; CHECK-NEXT: fmmla z0.s, z1.b, z2.b -; CHECK-NEXT: addvl sp, sp, #3 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: - %acc.addr = alloca , align 16 - %a.addr = alloca , align 16 - %b.addr = alloca , align 16 - %fpmr.addr = alloca i64, align 8 - store %acc, ptr %acc.addr, align 16 - store %a, ptr %a.addr, align 16 - store %b, ptr %b.addr, align 16 - store i64 %fpmr, ptr %fpmr.addr, align 8 - %0 = load , ptr %acc.addr, align 16 - %1 = load , ptr %a.addr, align 16 - %2 = load , ptr %b.addr, align 16 - %3 = load i64, ptr %fpmr.addr, align 8 - call void @llvm.aarch64.set.fpmr(i64 %3) - %4 = call @llvm.aarch64.sve.fmmla.mf8f32( %0, %1, %2) - ret %4 + %out = call @llvm.aarch64.sve.fmmla.mf8f32( %acc, %a, %b) + ret %out } - -declare void @llvm.aarch64.set.fpmr(i64) - -declare @llvm.aarch64.sve.fmmla.mf8f32(, , )
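
For reference, a minimal C sketch of how the three widening svmmla overloads added by this series are meant to be called from user code, mirroring the clang tests above. The wrapper function names are illustrative, and the target-feature spellings in the comments (+sve-f16f32mm, +sve2 +f8f32mm, +sve2 +f8f16mm) are taken from the RUN lines of the tests; how you enable them (command-line flags or target attributes) is an assumption, not something this patch prescribes.

    // Illustrative usage sketch only; each call needs the matching target
    // feature enabled: +sve-f16f32mm for the F16->F32 form, +sve2 +f8f32mm
    // for the MF8->F32 form, and +sve2 +f8f16mm for the MF8->F16 form.
    #include <arm_sve.h>

    // F16 -> F32 widening matrix multiply-accumulate.
    svfloat32_t mmla_f16_to_f32(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
      return svmmla_f32_f16(acc, a, b);
    }

    // MF8 -> F32 widening form; the FP8 operand formats are described by the
    // fpm_t value, which is written to FPMR before the multiply executes.
    svfloat32_t mmla_mf8_to_f32(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b,
                                fpm_t fpm) {
      return svmmla_f32_mf8_fpm(acc, a, b, fpm);
    }

    // MF8 -> F16 widening form, likewise taking an fpm_t for FPMR.
    svfloat16_t mmla_mf8_to_f16(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b,
                                fpm_t fpm) {
      return svmmla_f16_mf8_fpm(acc, a, b, fpm);
    }

With the overloaded forms (SVE_OVERLOADED_FORMS in the tests), the same calls can be spelled as plain svmmla(acc, a, b) and svmmla_fpm(acc, a, b, fpm), which is what the second patch's switch to a single overloaded aarch64_sve_fmmla intrinsic enables.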