diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 53a0541ed290a..100be1b5e893c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -332,6 +332,9 @@ AMDGPU Support X86 Support ^^^^^^^^^^^ +- Add ISA of ``AMX-COMPLEX`` which supports ``tcmmimfp16ps`` and + ``tcmmrlfp16ps``. + Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index 4b9e7d29d6517..e5c1fe8b31921 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -117,6 +117,8 @@ TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx- TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16") TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16") +TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") @@ -134,6 +136,9 @@ TARGET_BUILTIN(__builtin_ia32_tdpbuud, "vIUcIUcIUc", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16") TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite") +TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex") + TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 0cc7052b67105..831f8dd65a3e6 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4656,6 +4656,8 @@ def m3dnowa : Flag<["-"], "m3dnowa">, Group; def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group; def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group; def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group; +def mamx_complex : Flag<["-"], "mamx-complex">, Group; +def mno_amx_complex : Flag<["-"], "mno-amx-complex">, Group; def mamx_fp16 : Flag<["-"], "mamx-fp16">, Group; def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group; def mamx_int8 : Flag<["-"], "mamx-int8">, Group; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index ac04bf9f3dd18..0cffc76d3f2ce 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -335,6 +335,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAMXINT8 = true; } else if (Feature == "+amx-tile") { HasAMXTILE = true; + } else if (Feature == "+amx-complex") { + HasAMXCOMPLEX = true; } else if (Feature == "+cmpccxadd") { HasCMPCCXADD = true; } else if (Feature == "+raoint") { @@ -799,6 +801,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_BF16__"); if (HasAMXFP16) Builder.defineMacro("__AMX_FP16__"); + if (HasAMXCOMPLEX) + Builder.defineMacro("__AMXCOMPLEX__"); if (HasCMPCCXADD) Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) @@ -912,6 +916,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("adx", true) .Case("aes", true) .Case("amx-bf16", true) + .Case("amx-complex", true) .Case("amx-fp16", true) .Case("amx-int8", true) .Case("amx-tile", true) @@ -1013,6 +1018,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("adx", HasADX) .Case("aes", HasAES) .Case("amx-bf16", HasAMXBF16) + .Case("amx-complex", HasAMXCOMPLEX) .Case("amx-fp16", HasAMXFP16) .Case("amx-int8", HasAMXINT8) .Case("amx-tile", HasAMXTILE) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 816bf13770a0c..9a563db496dcd 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -154,6 +154,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXTILE = false; bool HasAMXINT8 = false; bool HasAMXBF16 = false; + bool HasAMXCOMPLEX = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; bool HasUINTR = false; diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index bb9a11eabbeff..52f0e23a265cb 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -117,6 +117,7 @@ set(x86_files # Intrinsics adxintrin.h ammintrin.h + amxcomplexintrin.h amxfp16intrin.h amxintrin.h avx2intrin.h diff --git a/clang/lib/Headers/amxcomplexintrin.h b/clang/lib/Headers/amxcomplexintrin.h new file mode 100644 index 0000000000000..84ef972fcadf0 --- /dev/null +++ b/clang/lib/Headers/amxcomplexintrin.h @@ -0,0 +1,169 @@ +/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_COMPLEXINTRIN_H +#define __AMX_COMPLEXINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_COMPLEX \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-complex"))) + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles \a a and \a b is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// Calculates the imaginary part of the result. For each possible combination +/// of (row of \a a, column of \a b), it performs a set of multiplication +/// and accumulations on all corresponding complex numbers (one from \a a +/// and one from \a b). The imaginary part of the \a a element is multiplied +/// with the real part of the corresponding \a b element, and the real part +/// of the \a a element is multiplied with the imaginary part of the +/// corresponding \a b elements. The two accumulated results are added, and +/// then accumulated into the corresponding row and column of \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO (a.colsb / 4) - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b) + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles \a a and \a b is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// Calculates the real part of the result. For each possible combination +/// of (row of \a a, column of \a b), it performs a set of multiplication +/// and accumulations on all corresponding complex numbers (one from \a a +/// and one from \a b). The real part of the \a a element is multiplied +/// with the real part of the corresponding \a b element, and the negated +/// imaginary part of the \a a element is multiplied with the imaginary +/// part of the corresponding \a b elements. The two accumulated results +/// are added, and then accumulated into the corresponding row and column +/// of \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO (a.colsb / 4) - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) +/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b) + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX +_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX +_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2); +} + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles src0 and src1 is interpreted as a complex number with +/// FP16 real part and FP16 imaginary part. +/// This function calculates the imaginary part of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_COMPLEX +static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles src0 and src1 is interpreted as a complex number with +/// FP16 real part and FP16 imaginary part. +/// This function calculates the real part of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCMMRLFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_COMPLEX +static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +#endif // __x86_64__ +#endif // __AMX_COMPLEXINTRIN_H diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 0d2e8be6e4862..8e109af0f5817 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -538,6 +538,11 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AMXCOMPLEX__) +#include +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index c8b42519c88dc..af67552cba39b 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5237,6 +5237,8 @@ bool Sema::CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tdpbuud: case X86::BI__builtin_ia32_tdpbf16ps: case X86::BI__builtin_ia32_tdpfp16ps: + case X86::BI__builtin_ia32_tcmmimfp16ps: + case X86::BI__builtin_ia32_tcmmrlfp16ps: return CheckX86BuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); } } diff --git a/clang/test/CodeGen/X86/amx_complex_api.c b/clang/test/CodeGen/X86/amx_complex_api.c new file mode 100644 index 0000000000000..113c7f62753f7 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_complex_api.c @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f -target-feature +amx-bf16 \ +// RUN: -target-feature +amx-complex \ +// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK + +#include + +char buf[1024]; +#define STRIDE 32 + +char buf2[1024]; + +void test_tile_cmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) { + //CHECK-LABEL: @test_tile_cmmimfp16ps + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call x86_amx @llvm.x86.tcmmimfp16ps.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile_cmmimfp16ps(&c, a, b); +} + +void test_tile_cmmrlfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) { + //CHECK-LABEL: @test_tile_cmmrlfp16ps + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call x86_amx @llvm.x86.tcmmrlfp16ps.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile_cmmrlfp16ps(&c, a, b); +} diff --git a/clang/test/CodeGen/X86/amxcomplex-builtins.c b/clang/test/CodeGen/X86/amxcomplex-builtins.c new file mode 100644 index 0000000000000..a5478b0395501 --- /dev/null +++ b/clang/test/CodeGen/X86/amxcomplex-builtins.c @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-complex \ +// RUN: -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s + +#include +#include +void test_tile_cmmimfp16ps(void) { + // CHECK-LABEL: @test_tile_cmmimfp16ps + // CHECK: call void @llvm.x86.tcmmimfp16ps(i8 1, i8 2, i8 3) + _tile_cmmimfp16ps(1, 2, 3); +} + +void test_tile_cmmrlfp16ps(void) { + // CHECK-LABEL: @test_tile_cmmrlfp16ps + // CHECK: call void @llvm.x86.tcmmrlfp16ps(i8 1, i8 2, i8 3) + _tile_cmmrlfp16ps(1, 2, 3); +} diff --git a/clang/test/CodeGen/X86/amxcomplex-errors.c b/clang/test/CodeGen/X86/amxcomplex-errors.c new file mode 100644 index 0000000000000..3dd5ea5b01fb0 --- /dev/null +++ b/clang/test/CodeGen/X86/amxcomplex-errors.c @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-complex -emit-llvm -fsyntax-only -verify + +#include +#include +void test_tile_cmmimfp16ps() { + _tile_cmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_cmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}} + _tile_cmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} + _tile_cmmimfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} +} + +void test_tile_cmmrlfp16ps() { + _tile_cmmrlfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_cmmrlfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}} + _tile_cmmrlfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} + _tile_cmmrlfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} +} diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index d5e40ffb1807b..6819c907f5c98 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -297,6 +297,13 @@ // AMX-FP16: "-target-feature" "+amx-fp16" // NO-AMX-FP16: "-target-feature" "-amx-fp16" +// RUN: %clang -target x86_64-unknown-linux-gnu -mamx-complex %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-COMPLEX %s +// RUN: %clang -target x86_64-unknown-linux-gnu -mno-amx-complex %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-COMPLEX %s +// AMX-COMPLEX: "-target-feature" "+amx-complex" +// NO-AMX-COMPLEX: "-target-feature" "-amx-complex" + // RUN: %clang --target=i386 -march=i386 -mhreset %s -### 2>&1 | FileCheck -check-prefix=HRESET %s // RUN: %clang --target=i386 -march=i386 -mno-hreset %s -### 2>&1 | FileCheck -check-prefix=NO-HRESET %s // HRESET: "-target-feature" "+hreset" diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 326582446fcff..5bf38cc3b256b 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -559,6 +559,18 @@ // NO-AMX-FP16-NOT: #define __AMX_FP16__ 1 // NO-AMX-FP16-NOT: #define __AMX_TILE__ 1 +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-complex -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-COMPLEX %s + +// AMX-COMPLEX: #define __AMXCOMPLEX__ 1 + +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-complex -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-COMPLEX %s +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-complex -mno-amx-tile \ +// RUN: -x c -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-COMPLEX %s + +// NO-AMX-COMPLEX-NOT: #define __AMXCOMPLEX__ 1 + // RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s // AVXVNNI: #define __AVX2__ 1 diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 239f15809e29e..d8d8bc59987e1 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5352,6 +5352,16 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>, ImmArg>]>; + // AMX-COMPLEX + def int_x86_tcmmimfp16ps : ClangBuiltin<"__builtin_ia32_tcmmimfp16ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, ImmArg>, + ImmArg>]>; + def int_x86_tcmmrlfp16ps : ClangBuiltin<"__builtin_ia32_tcmmrlfp16ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, ImmArg>, + ImmArg>]>; + // AMX - internal intrinsics def int_x86_ldtilecfg_internal : ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">, @@ -5414,6 +5424,19 @@ let TargetPrefix = "x86" in { DefaultAttrsIntrinsic<[llvm_x86amx_ty], [llvm_anyvector_ty], [IntrNoMem]>; def int_x86_cast_tile_to_vector: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_x86amx_ty], [IntrNoMem]>; + + def int_x86_tcmmimfp16ps_internal : + ClangBuiltin<"__builtin_ia32_tcmmimfp16ps_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, + llvm_x86amx_ty, llvm_x86amx_ty, + llvm_x86amx_ty], []>; + def int_x86_tcmmrlfp16ps_internal : + ClangBuiltin<"__builtin_ia32_tcmmrlfp16ps_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, + llvm_x86amx_ty, llvm_x86amx_ty, + llvm_x86amx_ty], []>; } //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index feec0b81f5263..7ed594167734e 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -167,6 +167,7 @@ X86_FEATURE (3DNOWA, "3dnowa") X86_FEATURE (64BIT, "64bit") X86_FEATURE (ADX, "adx") X86_FEATURE (AMX_BF16, "amx-bf16") +X86_FEATURE (AMX_COMPLEX, "amx-complex") X86_FEATURE (AMX_INT8, "amx-int8") X86_FEATURE (AMX_TILE, "amx-tile") X86_FEATURE (CLDEMOTE, "cldemote") diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 0f0ee6c8c5b68..7f6399c937aea 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -267,6 +267,9 @@ def FeatureAMXBF16 : SubtargetFeature<"amx-bf16", "HasAMXBF16", "true", def FeatureAMXFP16 : SubtargetFeature<"amx-fp16", "HasAMXFP16", "true", "Support AMX amx-fp16 instructions", [FeatureAMXTILE]>; +def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true", + "Support AMX-COMPLEX instructions", + [FeatureAMXTILE]>; def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true", "Support CMPCCXADD instructions">; def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true", diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index f50602e884cb9..21ad08fb5f0af 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -562,6 +562,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.setDesc(TII->get(Opc)); return true; } + case X86::PTCMMIMFP16PSV: + case X86::PTCMMRLFP16PSV: case X86::PTDPBSSDV: case X86::PTDPBSUDV: case X86::PTDPBUSDV: @@ -573,6 +575,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.removeOperand(i); unsigned Opc; switch (Opcode) { + case X86::PTCMMIMFP16PSV: Opc = X86::TCMMIMFP16PS; break; + case X86::PTCMMRLFP16PSV: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break; case X86::PTDPBSUDV: Opc = X86::TDPBSUD; break; case X86::PTDPBUSDV: Opc = X86::TDPBUSD; break; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1d8327f35004c..c7eac481b6be6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -38235,6 +38235,23 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PTCMMIMFP16PS: + case X86::PTCMMRLFP16PS: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unexpected instruction!"); + case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break; + case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } } } diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index f47a06869796a..6948deb4bc14b 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -215,3 +215,45 @@ let Predicates = [HasAMXFP16, In64BitMode] in { } } } // HasAMXTILE, HasAMXFP16 + +let Predicates = [HasAMXCOMPLEX, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let Constraints = "$src1 = $dst" in { + def TCMMIMFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", + []>, T8PD, VEX_4V; + def TCMMRLFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", + []>, VEX_4V, VEX_WIG, T8PS; + + } // Constraints = "$src1 = $dst" + + let Constraints = "$src4 = $dst" in { + def PTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), + [(set TILE: $dst, + (int_x86_tcmmimfp16ps_internal GR16:$src1, GR16:$src2, + GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; + def PTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), + [(set TILE: $dst, + (int_x86_tcmmrlfp16ps_internal GR16:$src1, GR16:$src2, + GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; + } + + let usesCustomInserter = 1 in { + def PTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tcmmimfp16ps timm:$src1, + timm:$src2, timm:$src3)]>; + def PTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tcmmrlfp16ps timm:$src1, + timm:$src2, timm:$src3)]>; + } + } // SchedRW = [WriteSystem] +} diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index 1c7cb1d0ecd4e..9de9941bbba46 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -1010,6 +1010,7 @@ def HasTSXLDTRK : Predicate<"Subtarget->hasTSXLDTRK()">; def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">; def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">; def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; +def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasCRC32 : Predicate<"Subtarget->hasCRC32()">; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 325bc3af83e84..e861420bbd193 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -129,6 +129,8 @@ static std::pair getShape(IntrinsicInst *II, unsigned OpNo) { } // a * b + c // The shape depends on which operand. + case Intrinsic::x86_tcmmimfp16ps_internal: + case Intrinsic::x86_tcmmrlfp16ps_internal: case Intrinsic::x86_tdpbssd_internal: case Intrinsic::x86_tdpbsud_internal: case Intrinsic::x86_tdpbusd_internal: diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 0edc0a432f8e0..9898ce20e5ac3 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1003,6 +1003,8 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PTILEZEROV: case X86::PTDPBF16PSV: case X86::PTDPFP16PSV: + case X86::PTCMMIMFP16PSV: + case X86::PTCMMRLFP16PSV: MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); ShapeT Shape(&MO1, &MO2, MRI); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 20770a49f5c61..8da45d96a8cc7 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -606,6 +606,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_TILE = {}; constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; +constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-complex-internals.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-complex-internals.ll new file mode 100644 index 0000000000000..924572a5bfbdd --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-tile-complex-internals.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +; RUN: -mattr=+amx-complex \ +; RUN: -verify-machineinstrs | FileCheck %s + +define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, %ax +; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 +; CHECK-NEXT: tilezero %tmm1 +; CHECK-NEXT: tilezero %tmm2 +; CHECK-NEXT: tcmmimfp16ps %tmm1, %tmm0, %tmm2 +; CHECK-NEXT: tcmmrlfp16ps %tmm1, %tmm0, %tmm2 +; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + + %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) + %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) + %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) + + %c1 = call x86_amx @llvm.x86.tcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) + %c2 = call x86_amx @llvm.x86.tcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) + + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2) + ret void +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) + +declare x86_amx @llvm.x86.tcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amxcomplex-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amxcomplex-intrinsics.ll new file mode 100644 index 0000000000000..7774092e4210f --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amxcomplex-intrinsics.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py + +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-complex --show-mc-encoding | FileCheck %s + +define void @test_tcmmimfp16ps() { +; CHECK-LABEL: test_tcmmimfp16ps: +; CHECK: # %bb.0: +; CHECK-NEXT: tcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x61,0x6c,0xca] +; CHECK-NEXT: retq # encoding: [0xc3] + call void @llvm.x86.tcmmimfp16ps(i8 1, i8 2, i8 3) + ret void +} +declare void @llvm.x86.tcmmimfp16ps(i8 %A, i8 %B, i8 %C) + +define void @test_tcmmrlfp16ps() { +; CHECK-LABEL: test_tcmmrlfp16ps: +; CHECK: # %bb.0: +; CHECK-NEXT: tcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6c,0xca] +; CHECK-NEXT: retq # encoding: [0xc3] + call void @llvm.x86.tcmmrlfp16ps(i8 1, i8 2, i8 3) + ret void +} +declare void @llvm.x86.tcmmrlfp16ps(i8 %A, i8 %B, i8 %C) diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-att.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-att.txt new file mode 100644 index 0000000000000..5a0cb876d37a4 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-att.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding --disassemble < %s | FileCheck %s + +# CHECK: tcmmimfp16ps %tmm4, %tmm5, %tmm6 +0xc4,0xe2,0x59,0x6c,0xf5 + +# CHECK: tcmmimfp16ps %tmm1, %tmm2, %tmm3 +0xc4,0xe2,0x71,0x6c,0xda + +# CHECK: tcmmrlfp16ps %tmm4, %tmm5, %tmm6 +0xc4,0xe2,0x58,0x6c,0xf5 + +# CHECK: tcmmrlfp16ps %tmm1, %tmm2, %tmm3 +0xc4,0xe2,0x70,0x6c,0xda diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-intel.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-intel.txt new file mode 100644 index 0000000000000..556fce78366a4 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-intel.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s + +# CHECK: tcmmimfp16ps tmm6, tmm5, tmm4 +0xc4,0xe2,0x59,0x6c,0xf5 + +# CHECK: tcmmimfp16ps tmm3, tmm2, tmm1 +0xc4,0xe2,0x71,0x6c,0xda + +# CHECK: tcmmrlfp16ps tmm6, tmm5, tmm4 +0xc4,0xe2,0x58,0x6c,0xf5 + +# CHECK: tcmmrlfp16ps tmm3, tmm2, tmm1 +0xc4,0xe2,0x70,0x6c,0xda diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-complex-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-complex-att.s new file mode 100644 index 0000000000000..fd47208c6c17b --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-complex-att.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding < %s | FileCheck %s + +// CHECK: tcmmimfp16ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe2,0x59,0x6c,0xf5] + tcmmimfp16ps %tmm4, %tmm5, %tmm6 + +// CHECK: tcmmimfp16ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x71,0x6c,0xda] + tcmmimfp16ps %tmm1, %tmm2, %tmm3 + +// CHECK: tcmmrlfp16ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe2,0x58,0x6c,0xf5] + tcmmrlfp16ps %tmm4, %tmm5, %tmm6 + +// CHECK: tcmmrlfp16ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x70,0x6c,0xda] + tcmmrlfp16ps %tmm1, %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-complex-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-complex-intel.s new file mode 100644 index 0000000000000..18dce99c7716d --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-complex-intel.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: tcmmimfp16ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe2,0x59,0x6c,0xf5] + tcmmimfp16ps tmm6, tmm5, tmm4 + +// CHECK: tcmmimfp16ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe2,0x71,0x6c,0xda] + tcmmimfp16ps tmm3, tmm2, tmm1 + +// CHECK: tcmmrlfp16ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe2,0x58,0x6c,0xf5] + tcmmrlfp16ps tmm6, tmm5, tmm4 + +// CHECK: tcmmrlfp16ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe2,0x70,0x6c,0xda] + tcmmrlfp16ps tmm3, tmm2, tmm1