diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index d2b7b78b9970f..c63da3308d6a0 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1196,6 +1196,18 @@ def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in { def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla">; +let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in { + def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "MMdd", "h", MergeNone, "aarch64_sve_fmmla_f16f32", [IsOverloadNone]>; +} + +let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in { + def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "MM~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f32", [IsOverloadNone]>; +} + +let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in { + def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "OO~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f16", [IsOverloadNone]>; +} + def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">; def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2q">; def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp1q">; diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c new file mode 100644 index 0000000000000..bebaa059e5c84 --- /dev/null +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c @@ -0,0 +1,33 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_f32f16( +// CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.f16f32( [[ACC]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_( +// CPP-CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.f16f32( [[ACC]], [[A]], [[B]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) { + return SVE_ACLE_FUNC(svmmla, _f32_f16, , )(acc, a, b); +} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c new file mode 100644 index 0000000000000..a19ad0576bb4b --- /dev/null +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c @@ -0,0 +1,35 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_f16mf8( +// CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f16( [[ACC]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m( +// CPP-CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f16( [[ACC]], [[A]], [[B]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) { + return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm, )(acc, a, b, fpmr); +} diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c new file mode 100644 index 0000000000000..526f2b1f45927 --- /dev/null +++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c @@ -0,0 +1,36 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 + +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: define dso_local @test_f32mf8( +// CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f32( [[ACC]], [[A]], [[B]]) +// CHECK-NEXT: ret [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m( +// CPP-CHECK-SAME: [[ACC:%.*]], [[A:%.*]], [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.fmmla.mf8f32( [[ACC]], [[A]], [[B]]) +// CPP-CHECK-NEXT: ret [[TMP0]] +// +svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) { + return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm, )(acc, a, b, fpmr); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index b81edc385cd43..832f97fc95959 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2807,6 +2807,20 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic; // def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic; +def int_aarch64_sve_fmmla_f16f32 + : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], + [ llvm_nxv4f32_ty, llvm_nxv8f16_ty, llvm_nxv8f16_ty ], + [IntrNoMem]>; + +def int_aarch64_sve_fmmla_mf8f32 + : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty], + [ llvm_nxv4f32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ], + [IntrNoMem]>; + +def int_aarch64_sve_fmmla_mf8f16 + : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty], + [ llvm_nxv8f16_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ], + [IntrNoMem]>; // // SVE ACLE: 7.2. BFloat16 extensions // diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3b268dcbca600..c756873d0bf7e 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3684,7 +3684,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in { } // End HasSVE, HasMatMulFP32 let Predicates = [HasSVE_F16F32MM] in { - def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>; + defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla_f16f32, nxv4f32, nxv8f16>; } // End HasSVE_F16F32MM let Predicates = [HasSVE, HasMatMulFP64] in { @@ -4745,10 +4745,14 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_ let Predicates = [HasSVE2, HasF8F32MM] in { def FMMLA_ZZZ_BtoS : sve2_fp8_mmla<0b0, ZPR32, "fmmla">; + def : Pat<(nxv4f32 (int_aarch64_sve_fmmla_mf8f32 nxv4f32:$acc, nxv16i8:$zn, nxv16i8:$zm)), + (FMMLA_ZZZ_BtoS $acc, $zn, $zm)>; } let Predicates = [HasSVE2, HasF8F16MM] in { def FMMLA_ZZZ_BtoH : sve2_fp8_mmla<0b1, ZPR16, "fmmla">; + def : Pat<(nxv8f16 (int_aarch64_sve_fmmla_mf8f16 nxv8f16:$acc, nxv16i8:$zn, nxv16i8:$zm)), + (FMMLA_ZZZ_BtoH $acc, $zn, $zm)>; } let Predicates = [HasSSVE_FP8DOT2] in { diff --git a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll new file mode 100644 index 0000000000000..ea636d65a479c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s --check-prefixes=CHECK + +define @_Z1tu13__SVFloat32_tu13__SVFloat16_tS0_( %acc, %a, %b) { +; CHECK-LABEL: _Z1tu13__SVFloat32_tu13__SVFloat16_tS0_: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: str z0, [sp, #2, mul vl] +; CHECK-NEXT: fmmla z0.s, z1.h, z2.h +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %acc.addr = alloca , align 16 + %a.addr = alloca , align 16 + %b.addr = alloca , align 16 + store %acc, ptr %acc.addr, align 16 + store %a, ptr %a.addr, align 16 + store %b, ptr %b.addr, align 16 + %0 = load , ptr %acc.addr, align 16 + %1 = load , ptr %a.addr, align 16 + %2 = load , ptr %b.addr, align 16 + %3 = call @llvm.aarch64.sve.fmmla.f16f32( %0, %1, %2) + ret %3 +} + +declare @llvm.aarch64.sve.fmmla.f16f32(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll new file mode 100644 index 0000000000000..0fdd6bf2508e3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s --check-prefixes=CHECK + +define @_Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m( %acc, %a, %b, i64 %fpmr) { +; CHECK-LABEL: _Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x8, sp, #3 +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: str x0, [x8, #8] +; CHECK-NEXT: msr FPMR, x0 +; CHECK-NEXT: fmmla z0.h, z1.b, z2.b +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %acc.addr = alloca , align 16 + %a.addr = alloca , align 16 + %b.addr = alloca , align 16 + %fpmr.addr = alloca i64, align 8 + store %acc, ptr %acc.addr, align 16 + store %a, ptr %a.addr, align 16 + store %b, ptr %b.addr, align 16 + store i64 %fpmr, ptr %fpmr.addr, align 8 + %0 = load , ptr %acc.addr, align 16 + %1 = load , ptr %a.addr, align 16 + %2 = load , ptr %b.addr, align 16 + %3 = load i64, ptr %fpmr.addr, align 8 + call void @llvm.aarch64.set.fpmr(i64 %3) + %4 = call @llvm.aarch64.sve.fmmla.mf8f16( %0, %1, %2) + ret %4 +} + +declare @llvm.aarch64.sve.fmmla.mf8f16(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll new file mode 100644 index 0000000000000..007a164ac77da --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s --check-prefixes=CHECK + +define dso_local @_Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m( %acc, %a, %b, i64 noundef %fpmr) #0 { +; CHECK-LABEL: _Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x8, sp, #3 +; CHECK-NEXT: str z1, [sp, #1, mul vl] +; CHECK-NEXT: str z0, [sp, #2, mul vl] +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: str x0, [x8, #8] +; CHECK-NEXT: msr FPMR, x0 +; CHECK-NEXT: fmmla z0.s, z1.b, z2.b +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %acc.addr = alloca , align 16 + %a.addr = alloca , align 16 + %b.addr = alloca , align 16 + %fpmr.addr = alloca i64, align 8 + store %acc, ptr %acc.addr, align 16 + store %a, ptr %a.addr, align 16 + store %b, ptr %b.addr, align 16 + store i64 %fpmr, ptr %fpmr.addr, align 8 + %0 = load , ptr %acc.addr, align 16 + %1 = load , ptr %a.addr, align 16 + %2 = load , ptr %b.addr, align 16 + %3 = load i64, ptr %fpmr.addr, align 8 + call void @llvm.aarch64.set.fpmr(i64 %3) + %4 = call @llvm.aarch64.sve.fmmla.mf8f32( %0, %1, %2) + ret %4 +} + +declare void @llvm.aarch64.set.fpmr(i64) + +declare @llvm.aarch64.sve.fmmla.mf8f32(, , )