llvm · phoebewang · Nov 11, 2024 · Oct 26, 2024 · Nov 10, 2024 · Nov 10, 2024
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
@@ -740,6 +740,7 @@ X86 Support
 - Support ISA of ``AMX-FP8``.
 - Support ISA of ``AMX-TRANSPOSE``.
 - Support ISA of ``AMX-AVX512``.
+- Support ISA of ``AMX-TF32``.
 
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -139,6 +139,9 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n",
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32")
+TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose")
+
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
@@ -172,10 +175,6 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512,avx10
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512,avx10.2-512")
 
-TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
-TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd")
-
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
@@ -185,6 +184,14 @@ TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8")
 TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
 TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8")
 
+// AMX TF32
+TARGET_BUILTIN(__builtin_ia32_tmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32")
+TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32,amx-transpose")
+
+TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi")
+TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd")
+TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd")
+
 // RAO-INT
 TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint")
 TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint")

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
@@ -6297,6 +6297,8 @@ def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
 def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
 def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group<m_x86_Features_Group>;
 def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group<m_x86_Features_Group>;
+def mamx_tf32 : Flag<["-"], "mamx-tf32">, Group<m_x86_Features_Group>;
+def mno_amx_tf32 : Flag<["-"], "mno-amx-tf32">, Group<m_x86_Features_Group>;
 def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
 def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;

diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
@@ -434,6 +434,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAMXTRANSPOSE = true;
     } else if (Feature == "+amx-avx512") {
       HasAMXAVX512 = true;
+    } else if (Feature == "+amx-tf32") {
+      HasAMXTF32 = true;
     } else if (Feature == "+cmpccxadd") {
       HasCMPCCXADD = true;
     } else if (Feature == "+raoint") {
@@ -959,6 +961,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__AMX_TRANSPOSE__");
   if (HasAMXAVX512)
     Builder.defineMacro("__AMX_AVX512__");
+  if (HasAMXTF32)
+    Builder.defineMacro("__AMX_TF32__");
   if (HasCMPCCXADD)
     Builder.defineMacro("__CMPCCXADD__");
   if (HasRAOINT)
@@ -1090,6 +1094,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("amx-fp16", true)
       .Case("amx-fp8", true)
       .Case("amx-int8", true)
+      .Case("amx-tf32", true)
       .Case("amx-tile", true)
       .Case("amx-transpose", true)
       .Case("avx", true)
@@ -1211,6 +1216,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("amx-fp16", HasAMXFP16)
       .Case("amx-fp8", HasAMXFP8)
       .Case("amx-int8", HasAMXINT8)
+      .Case("amx-tf32", HasAMXTF32)
       .Case("amx-tile", HasAMXTILE)
       .Case("amx-transpose", HasAMXTRANSPOSE)
       .Case("avx", SSELevel >= AVX)

diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
@@ -160,6 +160,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasAMXFP8 = false;
   bool HasAMXTRANSPOSE = false;
   bool HasAMXAVX512 = false;
+  bool HasAMXTF32 = false;
   bool HasSERIALIZE = false;
   bool HasTSXLDTRK = false;
   bool HasUSERMSR = false;

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
@@ -151,6 +151,8 @@ set(x86_files
   amxfp16intrin.h
   amxfp8intrin.h
   amxintrin.h
+  amxtf32intrin.h
+  amxtf32transposeintrin.h
   amxtransposeintrin.h
   avx10_2_512bf16intrin.h
   avx10_2_512convertintrin.h

diff --git a/clang/lib/Headers/amxtf32intrin.h b/clang/lib/Headers/amxtf32intrin.h
@@ -0,0 +1,108 @@
+/*===------------- amxtf32intrin.h - AMX_TF32 intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxtf32intrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_TF32INTRIN_H
+#define __AMX_TF32INTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_TF32                                                \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-tf32")))
+
+/// Do Matrix Multiplication of \a a and \a b, and then do Matrix Plus
+/// with \a srcdst.
+/// All the calculation is base on float32 but with the lower 13-bit set to 0.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_mmultf32ps(constexpr int srcdst, constexpr int a, \
+///                       constexpr int b);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
+///
+/// \param srcdst
+/// 	The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// 	The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// 	The 2nd source tile. Max size is 1024 Bytes.
+///
+/// \code{.operation}
+/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
+///	dword[12:0] := 0
+///	dword[31:13] := x[31:13]
+///	return dword
+/// }
+///
+/// DEFINE silence_snan_fp32(x[31:0]) {
+/// 	IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
+/// 		x.fraction[22] := 1
+/// 	return x
+/// }
+///
+/// elements_a := a.colsb / 4
+/// elements_dest := srcdst.colsb / 4
+///
+/// FOR m = 0 TO (srcdst.rows-1)
+/// 	tmp[511:0] := 0
+/// 	FOR k = 0 TO (elements_a-1)
+/// 		FOR n = 0 TO (elements_dest-1)
+/// 			af := silence_snan_fp32(a.row[m].fp32[k])
+/// 			bf := silence_snan_fp32(b.row[k].fp32[n])
+/// 			tmp.fp32[n] += zero_lower_mantissa_bits_fp32(af)
+/// 					* zero_lower_mantissa_bits_fp32(bf)
+/// 		ENDFOR
+/// 	ENDFOR
+///
+/// 	FOR n = 0 TO (elements_dest-1)
+/// 		tmp.fp32[n] += srcdst.row[m].fp32[n]
+/// 	ENDFOR
+///	write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
+///
+/// ENDFOR
+///
+/// zero_upper_rows(srcdst, srcdst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+#define _tile_mmultf32ps(srcdst, a, b)                                         \
+  __builtin_ia32_tmmultf32ps((srcdst), (a), (b))
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32
+_tile_mmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                          _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tmmultf32ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Do Matrix Multiplication of src0 and src1, and then do Matrix Plus with dst.
+/// All the calculation is base on float32 but with the lower 13-bit set to 0.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_TF32
+static void __tile_mmultf32ps(__tile1024i *dst, __tile1024i src0,
+                              __tile1024i src1) {
+  dst->tile = _tile_mmultf32ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                        src0.tile, src1.tile);
+}
+
+#endif // __x86_64__
+#endif // __AMX_TF32INTRIN_H
diff --git a/clang/lib/Headers/amxtf32transposeintrin.h b/clang/lib/Headers/amxtf32transposeintrin.h
@@ -0,0 +1,105 @@
+/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <amxtf32tranposeintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_TF32TRANSPOSEINTRIN_H
+#define __AMX_TF32TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE                                      \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-tf32,amx-transpose")))
+
+/// \code
+/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \
+///                        constexpr int b);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
+///
+/// \param srcdst
+/// 	The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// 	The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// 	The 2nd source tile. Max size is 1024 Bytes.
+///
+/// \code{.operation}
+/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) {
+/// 	dword[12:0] := 0
+/// 	dword[31:13] := x[31:13]
+/// 	return dword
+/// }
+///
+/// DEFINE silence_snan_fp32(x[31:0]) {
+/// 	IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0)
+/// 		x.fraction[22] := 1
+/// 	return x
+/// }
+///
+/// elements_dest:= srcdst.colsb/4
+///
+/// FOR m := 0 TO (srcdst.rows-1)
+/// 	tmp[511:0] := 0
+/// 	FOR k := 0 TO (a.rows-1)
+/// 		FOR n := 0 TO (elements_dest-1)
+/// 			a1e := silence_snan_fp32(a.row[k].fp32[m])
+/// 			a2e := silence_snan_fp32(b.row[k].fp32[n])
+/// 			s1e := zero_lower_mantissa_bits_fp32(a1e)
+/// 			s2e := zero_lower_mantissa_bits_fp32(a2e)
+/// 			tmp.fp32[n] += s1e * s2e
+/// 		ENDFOR
+/// 	ENDFOR
+///
+/// 	FOR n := 0 TO (elements_dest-1)
+/// 		tmp.fp32[n] += srcdst.row[m].fp32[n]
+/// 	ENDFOR
+///	write_row_and_zero(srcdst, m, tmp, srcdst.colsb)
+///
+/// ENDFOR
+///
+/// zero_upper_rows(srcdst, srcdst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+#define _tile_tmmultf32ps(srcdst, a, b)                                        \
+  __builtin_ia32_ttmmultf32ps((srcdst), (a), (b))
+
+// dst = m x n (srcdest), src1 = k x m, src2 = k x n
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE
+_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute transpose and do Matrix Multiplication of src0 and src1, and then do
+/// Matrix Plus with dst. All the calculation is base on float32 but with the
+/// lower 13-bit set to 0.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_TF32_TRANSPOSE
+static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0,
+                               __tile1024i src1) {
+  dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col,
+                                         dst->tile, src0.tile, src1.tile);
+}
+
+#endif // __x86_64__
+#endif // __AMX_TF32TRANSPOSEINTRIN_H
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
@@ -660,6 +660,15 @@ _storebe_i64(void * __P, long long __D) {
 #include <amxavx512intrin.h>
 #endif
 
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__)
+#include <amxtf32intrin.h>
+#endif
+
+#if !defined(__SCE__) || __has_feature(modules) ||                             \
+    (defined(__AMX_TF32__) && defined(__AMX_TRANSPOSE__))
+#include <amxtf32transposeintrin.h>
+#endif
+
 #if !defined(__SCE__) || __has_feature(modules) ||                             \
     defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>

diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
@@ -654,6 +654,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_tdpbhf8ps:
   case X86::BI__builtin_ia32_tdphbf8ps:
   case X86::BI__builtin_ia32_tdphf8ps:
+  case X86::BI__builtin_ia32_tmmultf32ps:
+  case X86::BI__builtin_ia32_ttmmultf32ps:
     return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
   case X86::BI__builtin_ia32_ttransposed:
     return CheckBuiltinTileArgumentsRange(TheCall, {0, 1});

diff --git a/clang/test/CodeGen/X86/amx_tf32.c b/clang/test/CodeGen/X86/amx_tf32.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-tf32 \
+// RUN: -target-feature +amx-transpose -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s
+
+#include <immintrin.h>
+#include <stddef.h>
+
+void test_tile_mmultf32ps(void) {
+  // CHECK-LABEL: @test_tile_mmultf32ps(
+  // CHECK: call void @llvm.x86.tmmultf32ps(i8 1, i8 2, i8 3)
+  _tile_mmultf32ps(1, 2, 3);
+}
+
+void test_tile_tmmultf32ps(void) {
+  // CHECK-LABEL: @test_tile_tmmultf32ps(
+  // CHECK: call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3)
+  _tile_tmmultf32ps(1, 2, 3);
+}
diff --git a/clang/test/CodeGen/X86/amx_tf32_api.c b/clang/test/CodeGen/X86/amx_tf32_api.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown \
+// RUN: -target-feature +amx-tf32 -target-feature +amx-transpose  \
+// RUN: -target-feature +amx-bf16 -target-feature +avx512f \
+// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s
+
+#include <immintrin.h>
+
+char buf[1024];
+#define STRIDE 32
+
+char buf2[1024];
+
+void test_tile_mmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_mmultf32ps
+  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+  //CHECK-DAG: call x86_amx @llvm.x86.tmmultf32ps.internal
+  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  __tile_mmultf32ps(&c, a, b);
+}
+
+void test_tile_tmmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) {
+  //CHECK-LABEL: @test_tile_tmmultf32ps
+  //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
+  //CHECK-DAG: call x86_amx @llvm.x86.ttmmultf32ps.internal
+  //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}})
+  __tile_tmmultf32ps(&c, a, b);
+}