[AArch64] Add intrinsics for 16-bit non-widening FMLA/FMLS #88553

momchil-velikov · 2024-04-12T18:40:28Z

According to the specification in ARM-software/acle#309
add the following intrinsics

void svmla[_single]_za16[_f16]_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm)
void svmla[_single]_za16[_f16]_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm)
void svmls[_single]_za16[_f16]_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm)
void svmls[_single]_za16[_f16]_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm)

void svmla_za16[_f16]_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm)
void svmla_za16[_f16]_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm)
void svmls_za16[_f16]_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm)
void svmls_za16[_f16]_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm)

void svmla_lane_za16[_f16]_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm, uint64_t imm_idx)
void svmla_lane_za16[_f16]_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm, uint64_t imm_idx)
void svmls_lane_za16[_f16]_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm, uint64_t imm_idx)
void svmls_lane_za16[_f16]_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm, uint64_t imm_idx)

as well as the corresponding _bf16 variants.

llvmbot · 2024-04-12T18:41:02Z

@llvm/pr-subscribers-clang

Author: Momchil Velikov (momchil-velikov)

Changes

Patch is 108.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/88553.diff

6 Files Affected:

(modified) clang/include/clang/Basic/arm_sme.td (+34)
(added) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c (+592)
(added) clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c (+90)
(modified) llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td (+28-29)
(modified) llvm/lib/Target/AArch64/SMEInstrFormats.td (+48-6)
(added) llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll (+462)

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1ac6d5170ea283..29bc627e7481ab 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -458,6 +458,40 @@ let TargetGuard = "sme2,sme-f64f64" in {
   def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
 }
 
+let TargetGuard = "sme2p1,sme-f16f16" in {
+  def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_F16 : Inst<"svmls_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_F16 : Inst<"svmls_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_F16 : Inst<"svmla_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_F16 : Inst<"svmla_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_F16 : Inst<"svmls_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_F16 : Inst<"svmls_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let TargetGuard = "sme2,b16b16" in {
+  def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_BF16 : Inst<"svmls_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_BF16 : Inst<"svmls_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
 // FMLAL/FMLSL/UMLAL/SMLAL
 // SMLALL/UMLALL/USMLALL/SUMLALL
 let TargetGuard = "sme2" in {
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 00000000000000..81ab3e62276b2b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,592 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1                               -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1                        -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS        -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
+#else
+#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x2j13svfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x4j13svfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_AC...
[truncated]

llvmbot · 2024-04-12T18:41:02Z

@llvm/pr-subscribers-backend-aarch64

Author: Momchil Velikov (momchil-velikov)

Changes

Patch is 108.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/88553.diff

6 Files Affected:

(modified) clang/include/clang/Basic/arm_sme.td (+34)
(added) clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c (+592)
(added) clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c (+90)
(modified) llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td (+28-29)
(modified) llvm/lib/Target/AArch64/SMEInstrFormats.td (+48-6)
(added) llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll (+462)

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1ac6d5170ea283..29bc627e7481ab 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -458,6 +458,40 @@ let TargetGuard = "sme2,sme-f64f64" in {
   def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>;
 }
 
+let TargetGuard = "sme2p1,sme-f16f16" in {
+  def SVMLA_MULTI_VG1x2_F16 : Inst<"svmla_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_F16 : Inst<"svmla_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_F16 : Inst<"svmls_za16[_f16]_vg1x2", "vm22", "h", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_F16 : Inst<"svmls_za16[_f16]_vg1x4", "vm44", "h", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_F16 : Inst<"svmla[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x2", "vm2d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_F16 : Inst<"svmls[_single]_za16[_f16]_vg1x4", "vm4d", "h", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_F16 : Inst<"svmla_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_F16 : Inst<"svmla_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_F16 : Inst<"svmls_lane_za16[_f16]_vg1x2", "vm2di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_F16 : Inst<"svmls_lane_za16[_f16]_vg1x4", "vm4di", "h", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let TargetGuard = "sme2,b16b16" in {
+  def SVMLA_MULTI_VG1x2_BF16 : Inst<"svmla_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_MULTI_VG1x4_BF16 : Inst<"svmla_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x2_BF16 : Inst<"svmls_za16[_bf16]_vg1x2", "vm22", "b", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_MULTI_VG1x4_BF16 : Inst<"svmls_za16[_bf16]_vg1x4", "vm44", "b", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_SINGLE_VG1x2_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLA_SINGLE_VG1x4_BF16 : Inst<"svmla[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x2_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x2", "vm2d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLS_SINGLE_VG1x4_BF16 : Inst<"svmls[_single]_za16[_bf16]_vg1x4", "vm4d", "b", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLA_LANE_VG1x2_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLA_LANE_VG1x4_BF16 : Inst<"svmla_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x2_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x2", "vm2di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLS_LANE_VG1x4_BF16 : Inst<"svmls_lane_za16[_bf16]_vg1x4", "vm4di", "b", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
 // FMLAL/FMLSL/UMLAL/SMLAL
 // SMLALL/UMLALL/USMLALL/SUMLALL
 let TargetGuard = "sme2" in {
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
new file mode 100644
index 00000000000000..81ab3e62276b2b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c
@@ -0,0 +1,592 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1                               -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1                        -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS        -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED, A5) A1##A3##A5
+#else
+#define SME_ACLE_FUNC(A1, A2, A3, A4, A5) A1##A2##A3##A4##A5
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmla_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmls_single_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z32test_svmls_single_za16_f16_vg1x4j13svfloat16x4_tu13__SVFloat16_t(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmls_single_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmls,_single,_za16,_f16,_vg1x4)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x2j13svfloat16x2_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x half> [[ZN:%.*]], <vscale x 16 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_f16_vg1x2(uint32_t slice, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svmla,,_za16,_f16,_vg1x2)(slice, zn, zm);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmla_za16_f16_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void @_Z25test_svmla_za16_f16_vg1x4j13svfloat16x4_tS_(
+// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], <vscale x 32 x half> [[ZN:%.*]], <vscale x 32 x half> [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24)
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 0)
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8)
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16)
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24)
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 [[SLICE]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_svmla_za16_f16_vg1x4(uint32_t slice, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_inout("za") {
+  SME_AC...
[truncated]

llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas16.ll

llvm/lib/Target/AArch64/SMEInstrFormats.td

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c

clang/include/clang/Basic/arm_sme.td

tmatheson-arm · 2024-05-20T21:36:24Z

clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fmlas16.c

+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS        -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -Werror -Wall -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall %s -o /dev/null


This needs something like -target-feature +sme2

momchil-velikov requested review from hassnaaHamdi, Lukacma and CarolineConcatto April 12, 2024 18:40

llvmbot added clang Clang issues not falling into any other category backend:AArch64 clang:frontend Language frontend issues, e.g. anything involving "Sema" labels Apr 12, 2024

CarolineConcatto reviewed Apr 15, 2024

View reviewed changes

hassnaaHamdi reviewed Apr 22, 2024

View reviewed changes

clang/include/clang/Basic/arm_sme.td Show resolved Hide resolved

momchil-velikov force-pushed the sme-fmla branch 2 times, most recently from 6aea424 to 3b40ff6 Compare April 30, 2024 12:39

CarolineConcatto approved these changes May 7, 2024

View reviewed changes

momchil-velikov added 3 commits May 9, 2024 16:22

[AArch64] Add intrinsics for 16-bit non-widening FMLA/FMLS

2f96bd7

[fixup] Fix incorrect dependency on sme2p1

6ee3f45

[fixup] Fix test RUN lines

42f10b3

momchil-velikov force-pushed the sme-fmla branch from 399d858 to 42f10b3 Compare May 9, 2024 15:48

momchil-velikov merged commit 64d4ade into llvm:main May 10, 2024
4 checks passed

tmatheson-arm reviewed May 20, 2024

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AArch64] Add intrinsics for 16-bit non-widening FMLA/FMLS #88553

[AArch64] Add intrinsics for 16-bit non-widening FMLA/FMLS #88553

momchil-velikov commented Apr 12, 2024 •

edited

llvmbot commented Apr 12, 2024

llvmbot commented Apr 12, 2024

tmatheson-arm May 20, 2024

[AArch64] Add intrinsics for 16-bit non-widening FMLA/FMLS #88553

[AArch64] Add intrinsics for 16-bit non-widening FMLA/FMLS #88553

Conversation

momchil-velikov commented Apr 12, 2024 • edited

llvmbot commented Apr 12, 2024

llvmbot commented Apr 12, 2024

tmatheson-arm May 20, 2024

Choose a reason for hiding this comment

momchil-velikov commented Apr 12, 2024 •

edited