diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 1ac6d5170ea28..7d292b2963093 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -674,3 +674,27 @@ let TargetGuard = "sme2" in { def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; } + +//////////////////////////////////////////////////////////////////////////////// +// SME2p1 - FMOPA, FMOPS (non-widening) +let TargetGuard = "sme2,b16b16" in { + def SVMOPA_BF16_NW : SInst<"svmopa_za16[_bf16]_m", "viPPdd", "b", + MergeNone, "aarch64_sme_mopa", + [IsStreaming, IsInOutZA], + [ImmCheck<0, ImmCheck0_1>]>; + def SVMOPS_BF16_NW : SInst<"svmops_za16[_bf16]_m", "viPPdd", "b", + MergeNone, "aarch64_sme_mops", + [IsStreaming, IsInOutZA], + [ImmCheck<0, ImmCheck0_1>]>; +} + +let TargetGuard = "sme-f16f16" in { + def SVMOPA_F16_NW : SInst<"svmopa_za16[_f16]_m", "viPPdd", "h", + MergeNone, "aarch64_sme_mopa", + [IsStreaming, IsInOutZA], + [ImmCheck<0, ImmCheck0_1>]>; + def SVMOPS_F16_NW : SInst<"svmops_za16[_f16]_m", "viPPdd", "h", + MergeNone, "aarch64_sme_mops", + [IsStreaming, IsInOutZA], + [ImmCheck<0, ImmCheck0_1>]>; +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c new file mode 100644 index 0000000000000..626bb6d3cf6f7 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c @@ -0,0 +1,97 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -O2 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK-CXX + +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +b16b16 -target-feature +sme-f16f16 -S -O2 -Werror -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SME_OVERLOADED_FORMS +#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: define dso_local void @test_svmopa_za16_bf16( +// CHECK-SAME: [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z21test_svmopa_za16_bf16u10__SVBool_tS_u14__SVBfloat16_tS0_( +// CHECK-CXX-SAME: [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM]]) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmopa_za16_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmopa_za16, _bf16, _m)(0, pn, pm, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmops_za16_bf16( +// CHECK-SAME: [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z21test_svmops_za16_bf16u10__SVBool_tS_u14__SVBfloat16_tS0_( +// CHECK-CXX-SAME: [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM]]) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.mops.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmops_za16_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmops_za16, _bf16, _m)(0, pn, pm, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmopa_za16_f16( +// CHECK-SAME: [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv8f16(i32 0, [[TMP0]], [[TMP1]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z20test_svmopa_za16_f16u10__SVBool_tS_u13__SVFloat16_tS0_( +// CHECK-CXX-SAME: [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM]]) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv8f16(i32 0, [[TMP0]], [[TMP1]], [[ZN]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmopa_za16_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmopa_za16, _f16, _m)(0, pn, pm, zn, zm); +} + +// CHECK-LABEL: define dso_local void @test_svmops_za16_f16( +// CHECK-SAME: [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv8f16(i32 0, [[TMP0]], [[TMP1]], [[ZN]], [[ZM]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z20test_svmops_za16_f16u10__SVBool_tS_u13__SVFloat16_tS0_( +// CHECK-CXX-SAME: [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN]]) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM]]) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.mops.nxv8f16(i32 0, [[TMP0]], [[TMP1]], [[ZN]], [[ZM]]) +// CHECK-CXX-NEXT: ret void +// +void test_svmops_za16_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmops_za16, _f16, _m)(0, pn, pm, zn, zm); +} diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c new file mode 100644 index 0000000000000..201ad4b8ff7f0 --- /dev/null +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_mopa_nonwide.c @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -verify -emit-llvm %s + +// REQUIRES: aarch64-registered-target + +#include + +void test_features(svbool_t pn, svbool_t pm, + svfloat16_t zn, svfloat16_t zm, + svbfloat16_t znb, svbfloat16_t zmb) + __arm_streaming __arm_inout("za") { +// expected-error@+1 {{'svmopa_za16_bf16_m' needs target feature sme2,b16b16}} + svmopa_za16_bf16_m(0, pn, pm, znb, zmb); +// expected-error@+1 {{'svmops_za16_bf16_m' needs target feature sme2,b16b16}} + svmops_za16_bf16_m(0, pn, pm, znb, zmb); +// expected-error@+1 {{'svmopa_za16_f16_m' needs target feature sme-f16f16}} + svmopa_za16_f16_m(0, pn, pm, zn, zm); +// expected-error@+1 {{'svmops_za16_f16_m' needs target feature sme-f16f16}} + svmops_za16_f16_m(0, pn, pm, zn, zm); +} + +void test_imm(svbool_t pn, svbool_t pm, + svfloat16_t zn, svfloat16_t zm, + svbfloat16_t znb, svbfloat16_t zmb) + __arm_streaming __arm_inout("za") { +// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmopa_za16_bf16_m(-1, pn, pm, znb, zmb); +// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmops_za16_bf16_m(-1, pn, pm, znb, zmb); +// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmopa_za16_f16_m(-1, pn, pm, zn, zm); +// expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmops_za16_f16_m(-1, pn, pm, zn, zm); +} + diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index e31e00a9c76f3..e0630a6649dd7 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3649,3 +3649,6 @@ def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic; def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic; def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic; + +def int_aarch64_sme_mopa_nonwide : SME_OuterProduct_Intrinsic; +def int_aarch64_sme_mops_nonwide : SME_OuterProduct_Intrinsic; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 574178c8d5244..97d84bcffffa8 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -817,8 +817,8 @@ defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, defm FCVT_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>; defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>; -defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, 0b11, ZPR16>; -defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>; +defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, nxv8f16, int_aarch64_sme_mopa>; +defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, nxv8f16, int_aarch64_sme_mops>; } let Predicates = [HasSME2, HasB16B16] in { @@ -865,8 +865,8 @@ defm BFMINNM_VG4_4Z2Z : sme2p1_bf_max_min_vector_vg4_multi<"bfminnm", 0b0010011 defm BFCLAMP_VG2_2ZZZ: sme2p1_bfclamp_vector_vg2_multi<"bfclamp">; defm BFCLAMP_VG4_4ZZZ: sme2p1_bfclamp_vector_vg4_multi<"bfclamp">; -defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, 0b11, ZPR16>; -defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, 0b11, ZPR16>; +defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, nxv8bf16, int_aarch64_sme_mopa>; +defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, nxv8bf16, int_aarch64_sme_mops>; } let Predicates = [HasSME2, HasFP8] in { @@ -928,7 +928,7 @@ defm FMLAL_VG4_M4ZZ_BtoH : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b001, M defm FMLAL_VG2_M2Z2Z_BtoH : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>; defm FMLAL_VG4_M4Z4Z_BtoH : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>; -defm FMOPA_MPPZZ_BtoH : sme2p1_fmop_tile_fp16<"fmopa", 0b1, 0b0, 0b01, ZPR8>; +defm FMOPA_MPPZZ_BtoH : sme2p1_fmop_tile_f8f16<"fmopa", 0b1, 0b0, 0b01>; } //[HasSMEF8F16] diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 3363aab4b093c..e7c2010907b4a 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -286,14 +286,26 @@ multiclass sme_outer_product_fp64 def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } -multiclass sme2p1_fmop_tile_fp16 op, ZPRRegOp zpr_ty>{ - def NAME : sme_fp_outer_product_inst { +multiclass sme2p1_fmop_tile_f8f16 op> { + def NAME : sme_fp_outer_product_inst { bits<1> ZAda; let Inst{2-1} = 0b00; let Inst{0} = ZAda; } } +multiclass sme2p1_fmop_tile_fp16 { + def NAME : sme_fp_outer_product_inst, SMEPseudo2Instr { + bits<1> ZAda; + let Inst{2-1} = 0b00; + let Inst{0} = ZAda; + } + + def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; + + def : SME_ZA_Tile_TwoPred_TwoVec_Pat; +} + class sme_int_outer_product_inst opc, bit sz, bit sme2, MatrixTileOperand za_ty, ZPRRegOp zpr_ty, string mnemonic> diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mopa.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mopa.ll new file mode 100644 index 0000000000000..fa0fd43607020 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mopa.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +define void @mopa_bf16( %pn, %pm, %zn, %zm) #0 { +; CHECK-LABEL: mopa_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmopa za0.h, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mopa.nxv8bf16(i32 0, %pn, %pm, %zn, %zm) + ret void +} + +define void @mopa_f16( %pn, %pm, %zn, %zm) #0 { +; CHECK-LABEL: mopa_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmopa za1.h, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mopa.nxv8f16(i32 1, %pn, %pm, %zn, %zm) + ret void +} + +define void @mops_bf16( %pn, %pm, %zn, %zm) #0 { +; CHECK-LABEL: mops_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: bfmops za0.h, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mops.nxv8bf16(i32 0, %pn, %pm, %zn, %zm) + ret void +} + +define void @mops_f16( %pn, %pm, %zn, %zm) #0 { +; CHECK-LABEL: mops_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmops za1.h, p0/m, p1/m, z0.h, z1.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mops.nxv8f16(i32 1, %pn, %pm, %zn, %zm) + ret void +} + +attributes #0 = {nounwind "target-features" = "+sme,+sme2p1,+bf16,+sme-f16f16,+b16b16" }