-
Notifications
You must be signed in to change notification settings - Fork 15k
[X86] Remove AMX-TRANSPOSE #165556
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] Remove AMX-TRANSPOSE #165556
Conversation
|
@llvm/pr-subscribers-mlgo @llvm/pr-subscribers-clang Author: Mikołaj Piróg (mikolaj-pirog) ChangesPer Intel Architecture Instruction Set Extensions Programming Reference rev. 59 (https://cdrdv2.intel.com/v1/dl/getContent/671368), Revision History entry for revision -59, AMX-TRANSPOSE was removed Patch is 526.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165556.diff 81 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td
index 275278c5ac089..062060e6afbbe 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.td
+++ b/clang/include/clang/Basic/BuiltinsX86_64.td
@@ -239,57 +239,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
def tcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
}
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz1rs_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz1t1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz1rst1_internal : X86Builtin<"void(unsigned short, unsigned short, unsigned short, _Vector<256, int *>, _Vector<256, int *>, void const *, size_t)">;
-}
-
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def ttransposed_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
-}
-
-let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
- def ttdpbf16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-}
-
-let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
- def ttdpfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-}
-
-let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
- def ttcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
- def ttcmmrlfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
- def tconjtcmmimfp16ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
- def tconjtfp16_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
-}
-
let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
def tcvtrowps2bf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
@@ -303,10 +252,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
def tmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
}
-let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
- def ttmmultf32ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
-}
-
let Features = "amx-fp8", Attributes = [NoThrow] in {
def tdpbf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
def tdpbhf8ps_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, unsigned short, _Vector<256, int>, _Vector<256, int>, _Vector<256, int>)">;
@@ -321,13 +266,6 @@ let Features = "amx-tile", Attributes = [NoThrow] in {
def tilezero : X86Builtin<"void(unsigned char)">;
}
-let Features = "amx-movrs,amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz0rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz1rs : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz1rst1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
-}
-
let Features = "amx-movrs", Attributes = [NoThrow] in {
def tileloaddrs64 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
def tileloaddrst164 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
@@ -359,29 +297,6 @@ let Features = "amx-complex", Attributes = [NoThrow] in {
def tcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
}
-let Features = "amx-transpose", Attributes = [NoThrow] in {
- def t2rpntlvwz0 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz0t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def t2rpntlvwz1t1 : X86Builtin<"void(_Constant unsigned char, void const *, size_t)">;
- def ttransposed : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
-}
-
-let Features = "amx-bf16,amx-transpose", Attributes = [NoThrow] in {
- def ttdpbf16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-}
-
-let Features = "amx-fp16,amx-transpose", Attributes = [NoThrow] in {
- def ttdpfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-}
-
-let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
- def ttcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
- def ttcmmrlfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
- def tconjtcmmimfp16ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
- def tconjtfp16 : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
-}
-
let Features = "amx-avx512,avx10.2", Attributes = [NoThrow] in {
def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">;
def tcvtrowps2bf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">;
@@ -406,10 +321,6 @@ let Features = "amx-tf32", Attributes = [NoThrow] in {
def tmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
}
-let Features = "amx-tf32,amx-transpose", Attributes = [NoThrow] in {
- def ttmmultf32ps : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char, _Constant unsigned char)">;
-}
-
let Features = "prefetchi", Attributes = [NoThrow, Const] in {
def prefetchi : X86Builtin<"void(void const *, unsigned int)">;
}
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 8784c9d7d206d..1d11db1209e47 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6695,8 +6695,6 @@ def mamx_tf32 : Flag<["-"], "mamx-tf32">, Group<m_x86_Features_Group>;
def mno_amx_tf32 : Flag<["-"], "mno-amx-tf32">, Group<m_x86_Features_Group>;
def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
-def mamx_transpose : Flag<["-"], "mamx-transpose">, Group<m_x86_Features_Group>;
-def mno_amx_transpose : Flag<["-"], "mno-amx-transpose">, Group<m_x86_Features_Group>;
def mamx_movrs: Flag<["-"], "mamx-movrs">, Group<m_x86_Features_Group>;
def mno_amx_movrs: Flag<["-"], "mno-amx-movrs">, Group<m_x86_Features_Group>;
def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index e71f10c4c16fc..7a90c89dd7dc0 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -396,8 +396,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXFP8 = true;
} else if (Feature == "+amx-movrs") {
HasAMXMOVRS = true;
- } else if (Feature == "+amx-transpose") {
- HasAMXTRANSPOSE = true;
} else if (Feature == "+amx-avx512") {
HasAMXAVX512 = true;
} else if (Feature == "+amx-tf32") {
@@ -925,8 +923,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMX_FP8__");
if (HasAMXMOVRS)
Builder.defineMacro("__AMX_MOVRS__");
- if (HasAMXTRANSPOSE)
- Builder.defineMacro("__AMX_TRANSPOSE__");
if (HasAMXAVX512)
Builder.defineMacro("__AMX_AVX512__");
if (HasAMXTF32)
@@ -1068,7 +1064,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("amx-movrs", true)
.Case("amx-tf32", true)
.Case("amx-tile", true)
- .Case("amx-transpose", true)
.Case("avx", true)
.Case("avx10.1", true)
.Case("avx10.2", true)
@@ -1189,7 +1184,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-movrs", HasAMXMOVRS)
.Case("amx-tf32", HasAMXTF32)
.Case("amx-tile", HasAMXTILE)
- .Case("amx-transpose", HasAMXTRANSPOSE)
.Case("avx", SSELevel >= AVX)
.Case("avx10.1", HasAVX10_1)
.Case("avx10.2", HasAVX10_2)
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index b924407b6ddd7..2381b2e7cf2cf 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2931,74 +2931,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// instruction, but it will create a memset that won't be optimized away.
return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
}
- // Corresponding to intrisics which will return 2 tiles (tile0_tile1).
- case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
- case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
- Intrinsic::ID IID;
- switch (BuiltinID) {
- default:
- llvm_unreachable("Unsupported intrinsic!");
- case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
- IID = Intrinsic::x86_t2rpntlvwz0_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
- IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
- IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
- IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
- IID = Intrinsic::x86_t2rpntlvwz1_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
- IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
- IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
- break;
- case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
- IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
- break;
- }
-
- // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
- Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
- {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
-
- auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
- assert(PtrTy && "arg3 must be of pointer type");
- QualType PtreeTy = PtrTy->getPointeeType();
- llvm::Type *TyPtee = ConvertType(PtreeTy);
-
- // Bitcast amx type (x86_amx) to vector type (256 x i32)
- // Then store tile0 into DstPtr0
- Value *T0 = Builder.CreateExtractValue(Call, 0);
- Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
- {TyPtee}, {T0});
- Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
-
- // Then store tile1 into DstPtr1
- Value *T1 = Builder.CreateExtractValue(Call, 1);
- Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
- {TyPtee}, {T1});
- Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
-
- // Note: Here we escape directly use x86_tilestored64_internal to store
- // the results due to it can't make sure the Mem written scope. This may
- // cause shapes reloads after first amx intrinsic, which current amx reg-
- // ister allocation has no ability to handle it.
-
- return Store;
- }
case X86::BI__ud2:
// llvm.trap makes a ud2a instruction on x86.
return EmitTrapCall(Intrinsic::trap);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 18589125697b0..33fff7645df65 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -162,18 +162,12 @@ set(x86_files
adxintrin.h
ammintrin.h
amxavx512intrin.h
- amxbf16transposeintrin.h
amxcomplexintrin.h
- amxcomplextransposeintrin.h
amxfp16intrin.h
- amxfp16transposeintrin.h
amxfp8intrin.h
amxintrin.h
amxmovrsintrin.h
- amxmovrstransposeintrin.h
amxtf32intrin.h
- amxtf32transposeintrin.h
- amxtransposeintrin.h
avx10_2_512bf16intrin.h
avx10_2_512convertintrin.h
avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
deleted file mode 100644
index 86f09f2ad8db2..0000000000000
--- a/clang/lib/Headers/amxbf16transposeintrin.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error \
- "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
-#endif /* __IMMINTRIN_H */
-
-#ifndef __AMX_BF16TRANSPOSEINTRIN_H
-#define __AMX_BF16TRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("amx-bf16,amx-transpose")))
-
-/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
-/// tiles \a a and \a b, accumulating the intermediate single-precision
-/// (32-bit) floating-point elements with elements in \a dst, and store the
-/// 32-bit result back to tile \a dst.
-///
-/// \headerfile <immintrin.h>
-///
-/// \code
-/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
-/// \endcode
-///
-/// \code{.operation}
-/// FOR m := 0 TO dst.rows - 1
-/// tmp := dst.row[m]
-/// FOR k := 0 TO (a.colsb / 4) - 1
-/// FOR n := 0 TO (dst.colsb / 4) - 1
-/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
-/// FP32(b.row[k].bf16[2*n+0])
-/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
-/// FP32(b.row[k].bf16[2*n+1])
-/// ENDFOR
-/// ENDFOR
-/// write_row_and_zero(dst, m, tmp, dst.colsb)
-/// ENDFOR
-/// zero_upper_rows(dst, dst.rows)
-/// zero_tileconfig_start()
-/// \endcode
-///
-/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param a
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param b
-/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps((dst), (a), (b))
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS
-_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
- return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
-}
-
-/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
-/// tiles src0 and src1, accumulating the intermediate single-precision
-/// (32-bit) floating-point elements with elements in "dst", and store the
-/// 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS
-static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
- dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
- src0.tile, src1.tile);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __x86_64__ */
-#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
deleted file mode 100644
index 11abaf98e9371..0000000000000
--- a/clang/lib/Headers/amxcomplextransposeintrin.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error \
- "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
-#endif // __IMMINTRIN_H
-
-#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
-#define __AMX_COMPLEXTRANSPOSEINTRIN_H
-#ifdef __x86_64__
-
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("amx-complex,amx-transpose")))
-
-/// Perform matrix multiplication of two tiles containing complex elements and
-/// accumulate the results into a packed single precision tile. Each dword
-/// element in input tiles \a a and \a b is interpreted as a complex number
-/// with FP16 real part and FP16 imaginary part.
-/// Calculates the imaginary part of the result. For each possible combination
-/// of (transposed column of \a a, column of \a b), it performs a set of
-/// multiplication and accumulations on all corre...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
| break; | ||
| case X86::PTCONJTCMMIMFP16PSV: | ||
| Opc = X86::TCONJTCMMIMFP16PS; | ||
| case X86::PTDPFP16PSV: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move it the same line to keep it as is. We may unify below code to the same format too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I unified code below, I turned off clang-format for this block
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just a few minors, otherwise LGTM.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Thanks!
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/166/builds/3551 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/138/builds/21159 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/27971 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/28156 Here is the relevant piece of the build log for the reference |
Per Intel Architecture Instruction Set Extensions Programming Reference rev. 59 (https://cdrdv2.intel.com/v1/dl/getContent/671368), Revision History entry for revision -59, AMX-TRANSPOSE was removed