diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1ac6d5170ea28..77ea53fb83fac 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -458,6 +458,40 @@ let TargetGuard = "sme2,sme-f64f64" in {
   def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
 }
 
+let TargetGuard = "sme-f16f16" in {
+  def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_F16 : Inst<"svmls_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_F16 : Inst<"svmls_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_F16 : Inst<"svmla_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_F16 : Inst<"svmla_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_F16 : Inst<"svmls_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_F16 : Inst<"svmls_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let TargetGuard = "sme2,b16b16" in {
+  def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_BF16 : Inst<"svmls_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_BF16 : Inst<"svmls_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
 // FMLAL/FMLSL/UMLAL/SMLAL
 // SMLALL/UMLALL/USMLALL/SUMLALL
 let TargetGuard = "sme2" in {
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 0000000000000..ecc4155454145
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,592 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
+#else
+#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half>
@llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x2)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x4)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + 
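+  // Note (illustrative comment, not autogenerated): with SME_OVERLOADED_FORMS
+  // defined, SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x2) expands to the
+  // overloaded form svmls_za16_vg1x2, which resolves on the argument types;
+  // without it, it expands to the explicit svmls_single_za16_f16_vg1x2.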
SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x2)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x4)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x2j13svfloat16x2_tS_( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 0) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x2)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void 
@test_svmla_za16_f16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x4j13svfloat16x4_tS_( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 0) +// CHECK-CXX-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-CXX-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x4)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmls_za16_f16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmls_za16_f16_vg1x2j13svfloat16x2_tS_( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// 
CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 0) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZM]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmls,,_za16,_f16,_vg1x2)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmls_za16_f16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmls_za16_f16_vg1x4j13svfloat16x4_tS_( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 0) +// CHECK-CXX-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 8) +// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 16) +// CHECK-CXX-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZM]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmls,,_za16,_f16,_vg1x4)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_f16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( 
[[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]], i32 7) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmla_lane_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_lane_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla_lane,,_za16,_f16,_vg1x2)(slice, zn, zm, 7); +} + +// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_f16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]], i32 7) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmla_lane_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_lane_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla_lane,,_za16,_f16,_vg1x4)(slice, zn, zm, 7); +} + +// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_f16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]], i32 7) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmls_lane_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_lane_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmls_lane,,_za16,_f16,_vg1x2)(slice, zn, zm, 7); +} + +// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_f16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]], i32 7) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z30test_svmls_lane_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_lane_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmls_lane,,_za16,_f16,_vg1x4)(slice, zn, zm, 7); +} + +// CHECK-LABEL: define dso_local void @test_svmla_single_za16_bf16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmla_single_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_single_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + 
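+  // Note (illustrative comment, not autogenerated): the bf16 tests below
+  // exercise the "sme2,b16b16"-guarded builtins from arm_sme.td; they lower
+  // to the same @llvm.aarch64.sme.fmla/fmls intrinsics as the f16 forms,
+  // distinguished only by the nxv8bf16 element-type suffix.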
SME_ACLE_FUNC(svmla, _single, _za16, _bf16, _vg1x2)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmla_single_za16_bf16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmla_single_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_single_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla, _single, _za16, _bf16, _vg1x4)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmls_single_za16_bf16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmls_single_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_single_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmls, _single, _za16, _bf16, _vg1x2)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmls_single_za16_bf16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z33test_svmls_single_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_single_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmls, _single, _za16, _bf16, _vg1x4)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmla_za16_bf16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmla_za16_bf16_vg1x2j14svbfloat16x2_tS_( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 0) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla, , _za16, _bf16, _vg1x2)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmla_za16_bf16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmla_za16_bf16_vg1x4j14svbfloat16x4_tS_( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 0) +// CHECK-CXX-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-CXX-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla, , _za16, _bf16, _vg1x4)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmls_za16_bf16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmls_za16_bf16_vg1x2j14svbfloat16x2_tS_( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZM]], i64 0) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( 
[[ZM]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmls, , _za16, _bf16, _vg1x2)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmls_za16_bf16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svmls_za16_bf16_vg1x4j14svbfloat16x4_tS_( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 0) +// CHECK-CXX-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 8) +// CHECK-CXX-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 16) +// CHECK-CXX-NEXT: [[TMP7:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZM]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP6]], [[TMP7]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmls_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmls, , _za16, _bf16, _vg1x4)(slice, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_bf16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]], i32 7) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void 
@_Z31test_svmla_lane_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_lane_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla_lane, , _za16, _bf16, _vg1x2)(slice, zn, zm, 7); +} + +// CHECK-LABEL: define dso_local void @test_svmla_lane_za16_bf16_vg1x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]], i32 7) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmla_lane_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[ZM]], i32 7) +// CHECK-CXX-NEXT: ret void +// +void test_svmla_lane_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmla_lane, , _za16, _bf16, _vg1x4)(slice, zn, zm, 7); +} + +// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_bf16_vg1x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[ZM]], i32 7) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmls_lane_za16_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// 
CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_bf16_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane, , _za16, _bf16, _vg1x2)(slice, zn, zm, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_lane_za16_bf16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z31test_svmls_lane_za16_bf16_vg1x4j14svbfloat16x4_tu14__SVBfloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 [[SLICE]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM]], i32 7)
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_lane_za16_bf16_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls_lane, , _za16, _bf16, _vg1x4)(slice, zn, zm, 7);
+}
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 0000000000000..b1582569971d4
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -emit-llvm %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+
+void test_features_f16f16(uint32_t slice,
+                          svfloat16_t zm,
+                          svfloat16x2_t zn2, svfloat16x2_t zm2,
+                          svfloat16x4_t zn4, svfloat16x4_t zm4,
+                          svbfloat16_t bzm,
+                          svbfloat16x2_t bzn2, svbfloat16x2_t bzm2,
+                          svbfloat16x4_t bzn4, svbfloat16x4_t bzm4)
+
+  __arm_streaming __arm_inout("za") {
+  // expected-error@+1 {{'svmla_single_za16_f16_vg1x2' needs target feature sme-f16f16}}
+  svmla_single_za16_f16_vg1x2(slice, zn2, zm);
+  // expected-error@+1 {{'svmla_single_za16_f16_vg1x4' needs target feature sme-f16f16}}
+  svmla_single_za16_f16_vg1x4(slice, zn4, zm);
+  // expected-error@+1 {{'svmls_single_za16_f16_vg1x2' needs target feature sme-f16f16}}
+  svmls_single_za16_f16_vg1x2(slice, zn2, zm);
+  // expected-error@+1 {{'svmls_single_za16_f16_vg1x4' needs target feature sme-f16f16}}
+  svmls_single_za16_f16_vg1x4(slice, zn4, zm);
+  // expected-error@+1 {{'svmla_za16_f16_vg1x2' needs target feature sme-f16f16}}
+  svmla_za16_f16_vg1x2(slice, zn2, zm2);
+  //
expected-error@+1 {{'svmla_za16_f16_vg1x4' needs target feature sme-f16f16}} + svmla_za16_f16_vg1x4(slice, zn4, zm4); + // expected-error@+1 {{'svmls_za16_f16_vg1x2' needs target feature sme-f16f16}} + svmls_za16_f16_vg1x2(slice, zn2, zm2); + // expected-error@+1 {{'svmls_za16_f16_vg1x4' needs target feature sme-f16f16}} + svmls_za16_f16_vg1x4(slice, zn4, zm4); + // expected-error@+1 {{'svmla_lane_za16_f16_vg1x2' needs target feature sme-f16f16}} + svmla_lane_za16_f16_vg1x2(slice, zn2, zm, 7); + // expected-error@+1 {{'svmla_lane_za16_f16_vg1x4' needs target feature sme-f16f16}} + svmla_lane_za16_f16_vg1x4(slice, zn4, zm, 7); + // expected-error@+1 {{'svmls_lane_za16_f16_vg1x2' needs target feature sme-f16f16}} + svmls_lane_za16_f16_vg1x2(slice, zn2, zm, 7); + // expected-error@+1 {{'svmls_lane_za16_f16_vg1x4' needs target feature sme-f16f16}} + svmls_lane_za16_f16_vg1x4(slice, zn4, zm, 7); + + // expected-error@+1 {{'svmla_single_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svmla_single_za16_bf16_vg1x2(slice, bzn2, bzm); + // expected-error@+1 {{'svmla_single_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svmla_single_za16_bf16_vg1x4(slice, bzn4, bzm); + // expected-error@+1 {{'svmls_single_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svmls_single_za16_bf16_vg1x2(slice, bzn2, bzm); + // expected-error@+1 {{'svmls_single_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svmls_single_za16_bf16_vg1x4(slice, bzn4, bzm); + // expected-error@+1 {{'svmla_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svmla_za16_bf16_vg1x2(slice, bzn2, bzm2); + // expected-error@+1 {{'svmla_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svmla_za16_bf16_vg1x4(slice, bzn4, bzm4); + // expected-error@+1 {{'svmls_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svmls_za16_bf16_vg1x2(slice, bzn2, bzm2); + // expected-error@+1 {{'svmls_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svmls_za16_bf16_vg1x4(slice, bzn4, bzm4); + // expected-error@+1 {{'svmla_lane_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svmla_lane_za16_bf16_vg1x2(slice, bzn2, bzm, 7); + // expected-error@+1 {{'svmla_lane_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svmla_lane_za16_bf16_vg1x4(slice, bzn4, bzm, 7); + // expected-error@+1 {{'svmls_lane_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svmls_lane_za16_bf16_vg1x2(slice, bzn2, bzm, 7); + // expected-error@+1 {{'svmls_lane_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svmls_lane_za16_bf16_vg1x4(slice, bzn4, bzm, 7); +} + + +void test_imm(uint32_t slice, svfloat16_t zm, svfloat16x2_t zn2,svfloat16x4_t zn4, + svbfloat16_t bzm, svbfloat16x2_t bzn2, svbfloat16x4_t bzn4) + __arm_streaming __arm_inout("za") { + + // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmla_lane_za16_f16_vg1x2(slice, zn2, zm, -1); + // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmla_lane_za16_f16_vg1x4(slice, zn4, zm, -1); + // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmls_lane_za16_f16_vg1x2(slice, zn2, zm, -1); + // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmls_lane_za16_f16_vg1x4(slice, zn4, zm, -1); + + // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmla_lane_za16_bf16_vg1x2(slice, bzn2, bzm, -1); + // expected-error@+1 {{argument value 18446744073709551615 is outside the valid 
range [0, 7]}} + svmla_lane_za16_bf16_vg1x4(slice, bzn4, bzm, -1); + // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmls_lane_za16_bf16_vg1x2(slice, bzn2, bzm, -1); + // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmls_lane_za16_bf16_vg1x4(slice, bzn4, bzm, -1); +} \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 574178c8d5244..a2e8c530c1dff 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -797,22 +797,20 @@ defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16 defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; -} -let Predicates = [HasSMEF16F16] in { -defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16>; -defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16>; -defm FMLA_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>; -defm FMLA_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; - -defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16>; -defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16>; -defm FMLS_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16>; -defm FMLS_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x2>; +defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_lane_vg1x4>; +defm FMLA_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x2>; +defm FMLA_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmla_single_vg1x4>; +defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x2>; +defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmla_vg1x4>; + +defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_lane_vg1x2>; +defm 
FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_lane_vg1x4>; +defm FMLS_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x2>; +defm FMLS_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmls_single_vg1x4>; +defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x2>; +defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmls_vg1x4>; defm FCVT_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>; defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>; @@ -827,20 +825,19 @@ defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16>; -defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16>; -defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16>; -defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; - -defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, ZZ_h_mul_r, ZPR4b16>; -defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b011, ZZZZ_h_mul_r, ZPR4b16>; -defm BFMLS_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16>; -defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x2>; +defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_lane_vg1x4>; +defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg2_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_single_vg1x2>; +defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg4_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmla_single_vg1x4>; +defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmla_vg1x2>; +defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmla_vg1x4>; +defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, 
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 3363aab4b093c..724dd07225cde 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -2448,9 +2448,29 @@ multiclass sme2_multi_vec_array_vg2_index_32b<string mnemonic, bits<2> sz, bits<
 }
 
 // SME2.1 multi-vec ternary indexed two registers 16-bit
-// SME2 multi-vec indexed FP8 two-way dot product to FP16 two registers
 multiclass sme2p1_multi_vec_array_vg2_index_16b<string mnemonic, bits<2> sz, bits<3> op,
-                                                RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> {
+                                                RegisterOperand multi_vector_ty, ZPRRegOp vector_ty,
+                                                ValueType vt, SDPatternOperator intrinsic> {
+  def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
+                                            multi_vector_ty, vector_ty,
+                                            VectorIndexH, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<3> i;
+    let Inst{11-10} = i{2-1};
+    let Inst{3}     = i{0};
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexH, SMEMatrixArray>;
+
+  def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexH, tileslice16>;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+        (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
+        multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexH:$i), 0>;
+}
+
+// SME2 multi-vec indexed FP8 two-way dot product to FP16 two registers
+multiclass sme2p1_multi_vec_array_vg2_index_f8f16<string mnemonic, bits<2> sz, bits<3> op,
+                                                  RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> {
   def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
                                             multi_vector_ty, zpr_ty,
                                             VectorIndexH, mnemonic> {
@@ -2569,10 +2589,10 @@ multiclass sme2_multi_vec_array_vg4_index_32b<string mnemonic, bits<4> op,
         multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexS32b_timm:$i), 0>;
 }
 
-// SME2.1 multi-vec ternary indexed four registers 16-bit
-multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
-                                                RegisterOperand multi_vector_ty,
-                                                ZPRRegOp zpr_ty> {
+// SME2.1 multi-vec ternary indexed four registers 16-bit (FP8)
+multiclass sme2p1_multi_vec_array_vg4_index_f8f16<string mnemonic, bits<3> op,
+                                                  RegisterOperand multi_vector_ty,
+                                                  ZPRRegOp zpr_ty> {
   def NAME : sme2_multi_vec_array_vg4_index<0b0, {0b1,?,?,op,?}, MatrixOp16,
                                             multi_vector_ty, zpr_ty, VectorIndexH,
                                             mnemonic> {
@@ -2586,6 +2606,28 @@ multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
         sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, VectorIndexH:$i), 0>;
 }
 
+// SME2.1 multi-vec ternary indexed four registers 16-bit
+multiclass sme2p1_multi_vec_array_vg4_index_16b<string mnemonic, bits<3> op,
+                                                RegisterOperand multi_vector_ty,
+                                                ZPRRegOp vector_ty, ValueType vt,
+                                                SDPatternOperator intrinsic> {
+  def NAME : sme2_multi_vec_array_vg4_index<0b0, {0b1,?,?,op,?}, MatrixOp16,
+                                            multi_vector_ty, vector_ty,
+                                            VectorIndexH, mnemonic>, SMEPseudo2Instr<NAME, 1> {
+    bits<3> i;
+    let Inst{11-10} = i{2-1};
+    let Inst{3}     = i{0};
+  }
+
+  def _PSEUDO : sme2_za_array_2op_multi_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, vector_ty, VectorIndexH, SMEMatrixArray>;
+
+  def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, vector_ty, vt, VectorIndexH, tileslice16>;
+
+  def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+        (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
+        sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexH:$i), 0>;
+}
+
 // SME2 multi-vec ternary indexed four registers 64-bit
 class sme2_multi_vec_array_vg4_index_64b<bits<3> op,
                                          RegisterOperand multi_vector_ty,
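
Each rewritten multiclass now produces three things: the real instruction (tagged SMEPseudo2Instr), a matching _PSEUDO, and a SME2_ZA_TwoOp_VG*_Multi_Index_Pat that routes the intrinsic through the pseudo; the i{2-1}/i{0} split places the 3-bit lane index into the encoding. A sketch of the C-level form that exercises this path (names illustrative):

```c
#include <arm_sme.h>

void f16_lane_mla(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm)
    __arm_streaming __arm_inout("za") {
  // The constant lane (here 5) flows through the index pattern into the
  // Inst{11-10}/Inst{3} bits set up by the multiclass above.
  svmla_lane_za16_f16_vg1x2(slice, zn, zm, 5);
  // Expected lowering, modulo register allocation:
  //   fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[5]
}
```
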
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll
new file mode 100644
index 0000000000000..3e807b7e63384
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll
@@ -0,0 +1,462 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "// kill:.*$" --version 4
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define void @test_fmla_f16_vg2_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmla_f16_vg2_single:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  ret void
+}
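+
+; Each test below calls its intrinsic twice, once with the incoming slice and
+; once with slice+7, covering the smallest and largest immediate offsets that
+; the za.h[w8, <offset>] slice operand can encode.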
+
+define void @test_fmla_f16_vg4_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmla_f16_vg4_single:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmls_f16_vg2_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmls_f16_vg2_single:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmls_f16_vg4_single(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmls_f16_vg4_single:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b)
+  ret void
+}
+
+define void @test_fmla_f16_vg2_multi(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1) #0 {
+; CHECK-LABEL: test_fmla_f16_vg2_multi:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  ret void
+}
+
+define void @test_fmla_f16_vg4_multi(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1, <vscale x 8 x half> %b2, <vscale x 8 x half> %b3) #0 {
+; CHECK-LABEL: test_fmla_f16_vg4_multi:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1, <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1, <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  ret void
+}
+
+define void @test_fmls_f16_vg2_multi(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1) #0 {
+; CHECK-LABEL: test_fmls_f16_vg2_multi:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1)
+  ret void
+}
+
+define void @test_fmls_f16_vg4_multi(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1, <vscale x 8 x half> %b2, <vscale x 8 x half> %b3) #0 {
+; CHECK-LABEL: test_fmls_f16_vg4_multi:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1, <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b0, <vscale x 8 x half> %b1, <vscale x 8 x half> %b2, <vscale x 8 x half> %b3)
+  ret void
+}
+
+define void @test_fmla_f16_vg2_index(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmla_f16_vg2_index:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    fmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b, i32 7)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b, i32 7)
+  ret void
+}
+
+define void @test_fmla_f16_vg4_index(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmla_f16_vg4_index:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    fmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b, i32 7)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b, i32 7)
+  ret void
+}
+
+define void @test_fmls_f16_vg2_index(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmls_f16_vg2_index:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    fmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b, i32 7)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %b, i32 7)
+  ret void
+}
+
+define void @test_fmls_f16_vg4_index(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: test_fmls_f16_vg4_index:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    fmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    fmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b, i32 7)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %a0, <vscale x 8 x half> %a1, <vscale x 8 x half> %a2, <vscale x 8 x half> %a3, <vscale x 8 x half> %b, i32 7)
+  ret void
+}
+
+define void @test_fmla_bf16_vg2_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmla_bf16_vg2_single:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmla_bf16_vg4_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmla_bf16_vg4_single:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmls_bf16_vg2_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmls_bf16_vg2_single:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmls_bf16_vg4_single(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmls_bf16_vg4_single:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b)
+  ret void
+}
+
+define void @test_fmla_bf16_vg2_multi(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1) #0 {
+; CHECK-LABEL: test_fmla_bf16_vg2_multi:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  ret void
+}
+
+define void @test_fmla_bf16_vg4_multi(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1, <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3) #0 {
+; CHECK-LABEL: test_fmla_bf16_vg4_multi:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1, <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1, <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  ret void
+}
+
+define void @test_fmls_bf16_vg2_multi(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1) #0 {
+; CHECK-LABEL: test_fmls_bf16_vg2_multi:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1)
+  ret void
+}
+
+define void @test_fmls_bf16_vg4_multi(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1, <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3) #0 {
+; CHECK-LABEL: test_fmls_bf16_vg4_multi:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1, <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b0, <vscale x 8 x bfloat> %b1, <vscale x 8 x bfloat> %b2, <vscale x 8 x bfloat> %b3)
+  ret void
+}
+
+define void @test_fmla_bf16_vg2_index(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmla_bf16_vg2_index:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    bfmla za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b, i32 7)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b, i32 7)
+  ret void
+}
+
+define void @test_fmla_bf16_vg4_index(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmla_bf16_vg4_index:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmla za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    bfmla za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b, i32 7)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b, i32 7)
+  ret void
+}
+
+define void @test_fmls_bf16_vg2_index(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmls_bf16_vg2_index:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, z2.h[7]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b, i32 7)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %b, i32 7)
+  ret void
+}
+
+define void @test_fmls_bf16_vg4_index(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fmls_bf16_vg4_index:
+; CHECK:       // %bb.0:
+; CHECK:    mov w8, w0
+; CHECK:    bfmls za.h[w8, 0, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    bfmls za.h[w8, 7, vgx4], { z0.h - z3.h }, z4.h[7]
+; CHECK:    ret
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b, i32 7)
+  %slice.7 = add i32 %slice, 7
+  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %a0, <vscale x 8 x bfloat> %a1, <vscale x 8 x bfloat> %a2, <vscale x 8 x bfloat> %a3, <vscale x 8 x bfloat> %b, i32 7)
+  ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sme2p1,+sme-f16f16,+b16b16" }
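
Taken together: under +sme2p1, +sme-f16f16, and +b16b16 (matching the TargetGuards in arm_sme.td), the multi-vector forms round-trip from the ACLE down to the new instructions. A final hedged sketch of the bf16 multi-vector form, including the slice+7 idiom the tests rely on for the immediate offset (names illustrative, regalloc may differ):

```c
#include <arm_sme.h>

// Multi-vector form: each half of `zn` accumulates against the matching
// half of `zm`. The constant slice+7 folds into the instruction's
// immediate offset field, as the CodeGen tests above verify.
void bf16_multi(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm)
    __arm_streaming __arm_inout("za") {
  svmla_za16_bf16_vg1x2(slice, zn, zm);      // bfmla za.h[w8, 0, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
  svmls_za16_bf16_vg1x2(slice + 7, zn, zm);  // bfmls za.h[w8, 7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
}
```
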