Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,7 @@ X86 Support
- Support ISA of ``AMX-FP8``.
- Support ISA of ``AMX-TRANSPOSE``.
- Support ISA of ``AMX-AVX512``.
- Support ISA of ``AMX-TF32``.

Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
15 changes: 11 additions & 4 deletions clang/include/clang/Basic/BuiltinsX86_64.def
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n",
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32")
TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose")

// AMX
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
Expand Down Expand Up @@ -172,10 +175,6 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512,avx10
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512,avx10.2-512")

TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd")

// AMX_FP16 FP16
TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")

Expand All @@ -185,6 +184,14 @@ TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8")
TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8")

// AMX TF32
TARGET_BUILTIN(__builtin_ia32_tmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32")
TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32,amx-transpose")

TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd")

// RAO-INT
TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint")
TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint")
Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -6297,6 +6297,8 @@ def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group<m_x86_Features_Group>;
def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group<m_x86_Features_Group>;
def mamx_tf32 : Flag<["-"], "mamx-tf32">, Group<m_x86_Features_Group>;
def mno_amx_tf32 : Flag<["-"], "mno-amx-tf32">, Group<m_x86_Features_Group>;
def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Basic/Targets/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXTRANSPOSE = true;
} else if (Feature == "+amx-avx512") {
HasAMXAVX512 = true;
} else if (Feature == "+amx-tf32") {
HasAMXTF32 = true;
} else if (Feature == "+cmpccxadd") {
HasCMPCCXADD = true;
} else if (Feature == "+raoint") {
Expand Down Expand Up @@ -959,6 +961,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_TRANSPOSE__");
if (HasAMXAVX512)
Builder.defineMacro("__AMX_AVX512__");
if (HasAMXTF32)
Builder.defineMacro("__AMX_TF32__");
if (HasCMPCCXADD)
Builder.defineMacro("__CMPCCXADD__");
if (HasRAOINT)
Expand Down Expand Up @@ -1090,6 +1094,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("amx-fp16", true)
.Case("amx-fp8", true)
.Case("amx-int8", true)
.Case("amx-tf32", true)
.Case("amx-tile", true)
.Case("amx-transpose", true)
.Case("avx", true)
Expand Down Expand Up @@ -1211,6 +1216,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-fp16", HasAMXFP16)
.Case("amx-fp8", HasAMXFP8)
.Case("amx-int8", HasAMXINT8)
.Case("amx-tf32", HasAMXTF32)
.Case("amx-tile", HasAMXTILE)
.Case("amx-transpose", HasAMXTRANSPOSE)
.Case("avx", SSELevel >= AVX)
Expand Down
1 change: 1 addition & 0 deletions clang/lib/Basic/Targets/X86.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAMXFP8 = false;
bool HasAMXTRANSPOSE = false;
bool HasAMXAVX512 = false;
bool HasAMXTF32 = false;
bool HasSERIALIZE = false;
bool HasTSXLDTRK = false;
bool HasUSERMSR = false;
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Headers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ set(x86_files
amxfp16intrin.h
amxfp8intrin.h
amxintrin.h
amxtf32intrin.h
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing the new file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry. Forgot to add it. Done. Thanks.

amxtf32transposeintrin.h
amxtransposeintrin.h
avx10_2_512bf16intrin.h
avx10_2_512convertintrin.h
Expand Down
108 changes: 108 additions & 0 deletions clang/lib/Headers/amxtf32intrin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*===------------- amxtf32intrin.h - AMX_TF32 intrinsics -*- C++ -*---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/

#ifndef __IMMINTRIN_H
#error "Never use <amxtf32intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_TF32INTRIN_H
#define __AMX_TF32INTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TF32 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tf32")))

/// Do Matrix Multiplication of \a a and \a b, and then do Matrix Plus
/// with \a srcdst.
/// All the calculation is based on float32 but with the lower 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_mmultf32ps(constexpr int srcdst, constexpr int a,
///                       constexpr int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
///
/// \param srcdst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
///   dword[12:0] := 0
///   dword[31:13] := x[31:13]
///   return dword
/// }
///
/// DEFINE silence_snan_fp32(x[31:0]) {
///   IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
///     x.fraction[22] := 1
///   return x
/// }
///
/// elements_a := a.colsb / 4
/// elements_dest := srcdst.colsb / 4
///
/// FOR m := 0 TO (srcdst.rows-1)
///   tmp[511:0] := 0
///   FOR k := 0 TO (elements_a-1)
///     FOR n := 0 TO (elements_dest-1)
///       af := silence_snan_fp32(a.row[m].fp32[k])
///       bf := silence_snan_fp32(b.row[k].fp32[n])
///       tmp.fp32[n] += zero_lower_mantissa_bits_fp32(af)
///                      * zero_lower_mantissa_bits_fp32(bf)
///     ENDFOR
///   ENDFOR
///
///   FOR n := 0 TO (elements_dest-1)
///     tmp.fp32[n] += srcdst.row[m].fp32[n]
///   ENDFOR
///   write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
///
/// ENDFOR
///
/// zero_upper_rows(srcdst, srcdst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_mmultf32ps(srcdst, a, b) \
__builtin_ia32_tmmultf32ps((srcdst), (a), (b))

/// Internal helper used by __tile_mmultf32ps. Passes its six arguments,
/// unchanged and in the same order, to __builtin_ia32_tmmultf32ps_internal and
/// returns the tile value that builtin yields.
///
/// \param m
///    First shape argument; __tile_mmultf32ps supplies src0.row.
/// \param n
///    Second shape argument; __tile_mmultf32ps supplies src1.col.
/// \param k
///    Third shape argument; __tile_mmultf32ps supplies src0.col.
/// \param dst
///    The destination tile value; __tile_mmultf32ps supplies dst->tile.
/// \param src1
///    The 1st source tile value.
/// \param src2
///    The 2nd source tile value.
/// \returns The tile value returned by the builtin.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32
_tile_mmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tmmultf32ps_internal(m, n, k, dst, src1, src2);
}

/// Do Matrix Multiplication of src0 and src1, and then do Matrix Plus with dst.
/// All the calculation is based on float32 but with the lower 13 bits set to 0.
///
/// The shape arguments handed to the underlying helper are taken from the
/// operands: src0.row, src1.col and src0.col, in that order. The result
/// replaces dst->tile.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TF32
static void __tile_mmultf32ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
// dst->tile is both an input (the accumulator) and the output of the call.
dst->tile = _tile_mmultf32ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}

#endif // __x86_64__
#endif // __AMX_TF32INTRIN_H
105 changes: 105 additions & 0 deletions clang/lib/Headers/amxtf32transposeintrin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <amxtf32transposeintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_TF32TRANSPOSEINTRIN_H
#define __AMX_TF32TRANSPOSEINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \
__attribute__((__always_inline__, __nodebug__, \
__target__("amx-tf32,amx-transpose")))

/// Compute transpose of \a a, do Matrix Multiplication of the result and
/// \a b, and then do Matrix Plus with \a srcdst.
/// All the calculation is based on float32 but with the lower 13 bits set to 0.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a,
///                        constexpr int b);
/// \endcode
///
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
///
/// \param srcdst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
///
/// \code{.operation}
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
///   dword[12:0] := 0
///   dword[31:13] := x[31:13]
///   return dword
/// }
///
/// DEFINE silence_snan_fp32(x[31:0]) {
///   IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
///     x.fraction[22] := 1
///   return x
/// }
///
/// elements_dest := srcdst.colsb / 4
///
/// FOR m := 0 TO (srcdst.rows-1)
///   tmp[511:0] := 0
///   FOR k := 0 TO (a.rows-1)
///     FOR n := 0 TO (elements_dest-1)
///       a1e := silence_snan_fp32(a.row[k].fp32[m])
///       a2e := silence_snan_fp32(b.row[k].fp32[n])
///       s1e := zero_lower_mantissa_bits_fp32(a1e)
///       s2e := zero_lower_mantissa_bits_fp32(a2e)
///       tmp.fp32[n] += s1e * s2e
///     ENDFOR
///   ENDFOR
///
///   FOR n := 0 TO (elements_dest-1)
///     tmp.fp32[n] += srcdst.row[m].fp32[n]
///   ENDFOR
///   write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
///
/// ENDFOR
///
/// zero_upper_rows(srcdst, srcdst.rows)
/// zero_tileconfig_start()
/// \endcode
#define _tile_tmmultf32ps(srcdst, a, b) \
__builtin_ia32_ttmmultf32ps((srcdst), (a), (b))

// Internal helper used by __tile_tmmultf32ps. Passes its six arguments,
// unchanged and in the same order, to __builtin_ia32_ttmmultf32ps_internal and
// returns the tile value that builtin yields.
// Shapes: dst = m x n (srcdst), src1 = k x m, src2 = k x n
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE
_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2);
}

/// Compute transpose and do Matrix Multiplication of src0 and src1, and then do
/// Matrix Plus with dst. All the calculation is based on float32 but with the
/// lower 13 bits set to 0.
///
/// The shape arguments handed to the underlying helper are taken from the
/// operands: src0.row, src1.col and src0.col, in that order. The result
/// replaces dst->tile.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TF32_TRANSPOSE
static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
// NOTE(review): _tile_tmmultf32ps_internal is commented as taking its first
// source as k x m, yet src0.row is passed as m and src0.col as k here, the
// same order the non-transposed variant uses. Confirm this ordering is
// intended for the transposed operand.
dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col,
dst->tile, src0.tile, src1.tile);
}

#endif // __x86_64__
#endif // __AMX_TF32TRANSPOSEINTRIN_H
9 changes: 9 additions & 0 deletions clang/lib/Headers/immintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,15 @@ _storebe_i64(void * __P, long long __D) {
#include <amxavx512intrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__)
#include <amxtf32intrin.h>
#endif

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added.

#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AMX_TF32__) && defined(__AMX_TRANSPOSE__))
#include <amxtf32transposeintrin.h>
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include <avx512vp2intersectintrin.h>
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/Sema/SemaX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_tdpbhf8ps:
case X86::BI__builtin_ia32_tdphbf8ps:
case X86::BI__builtin_ia32_tdphf8ps:
case X86::BI__builtin_ia32_tmmultf32ps:
case X86::BI__builtin_ia32_ttmmultf32ps:
return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
case X86::BI__builtin_ia32_ttransposed:
return CheckBuiltinTileArgumentsRange(TheCall, {0, 1});
Expand Down
17 changes: 17 additions & 0 deletions clang/test/CodeGen/X86/amx_tf32.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-tf32 \
// RUN: -target-feature +amx-transpose -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s

#include <immintrin.h>
#include <stddef.h>

// Invokes the _tile_mmultf32ps macro with the constant tile numbers 1, 2, 3.
// The FileCheck directives in the body expect a call to
// @llvm.x86.tmmultf32ps that carries those three values as i8 arguments.
void test_tile_mmultf32ps(void) {
// CHECK-LABEL: @test_tile_mmultf32ps(
// CHECK: call void @llvm.x86.tmmultf32ps(i8 1, i8 2, i8 3)
_tile_mmultf32ps(1, 2, 3);
}

// Invokes the _tile_tmmultf32ps macro with the constant tile numbers 1, 2, 3.
// The FileCheck directives in the body expect a call to
// @llvm.x86.ttmmultf32ps that carries those three values as i8 arguments.
void test_tile_tmmultf32ps(void) {
// CHECK-LABEL: @test_tile_tmmultf32ps(
// CHECK: call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3)
_tile_tmmultf32ps(1, 2, 3);
}
27 changes: 27 additions & 0 deletions clang/test/CodeGen/X86/amx_tf32_api.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown \
// RUN: -target-feature +amx-tf32 -target-feature +amx-transpose \
// RUN: -target-feature +amx-bf16 -target-feature +avx512f \
// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s

#include <immintrin.h>

// NOTE(review): buf, STRIDE and buf2 are not referenced by either test
// function in this file; presumably boilerplate carried over from another AMX
// API test -- confirm and drop if unneeded.
char buf[1024];
#define STRIDE 32

char buf2[1024];

// Calls __tile_mmultf32ps with &c as the destination and a, b as the sources.
// The FileCheck directives in the body look, in any order, for a
// vector-to-tile cast, a call to @llvm.x86.tmmultf32ps.internal and a
// tile-to-vector cast in the emitted IR.
void test_tile_mmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
//CHECK-LABEL: @test_tile_mmultf32ps
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
//CHECK-DAG: call x86_amx @llvm.x86.tmmultf32ps.internal
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
__tile_mmultf32ps(&c, a, b);
}

// Calls __tile_tmmultf32ps with &c as the destination and a, b as the sources.
// The FileCheck directives in the body look, in any order, for a
// vector-to-tile cast, a call to @llvm.x86.ttmmultf32ps.internal and a
// tile-to-vector cast in the emitted IR.
void test_tile_tmmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
//CHECK-LABEL: @test_tile_tmmultf32ps
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
//CHECK-DAG: call x86_amx @llvm.x86.ttmmultf32ps.internal
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
__tile_tmmultf32ps(&c, a, b);
}
Loading
Loading