-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[X86][AMX] Support AMX-TF32 #115625
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
[X86][AMX] Support AMX-TF32 #115625
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
/*===------------- amxtf32intrin.h - AMX_TF32 intrinsics -*- C++ -*---------=== | ||
* | ||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
* See https://llvm.org/LICENSE.txt for license information. | ||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
* | ||
*===------------------------------------------------------------------------=== | ||
*/ | ||
|
||
#ifndef __IMMINTRIN_H | ||
#error "Never use <amxtf32intrin.h> directly; include <immintrin.h> instead." | ||
#endif // __IMMINTRIN_H | ||
|
||
#ifndef __AMX_TF32INTRIN_H | ||
#define __AMX_TF32INTRIN_H | ||
#ifdef __x86_64__ | ||
|
||
// Attributes applied to the inline functions defined in this header: force | ||
// inlining, omit debug info, and enable the "amx-tf32" target feature. | ||
#define __DEFAULT_FN_ATTRS_TF32 \ | ||
__attribute__((__always_inline__, __nodebug__, __target__("amx-tf32"))) | ||
|
||
/// Multiply the matrices in tiles \a a and \a b, then add the product to | ||
/// \a srcdst and store the sum back into \a srcdst. | ||
/// All calculations are based on float32, with the lower 13 bits of each | ||
/// source element set to 0 before it is multiplied. | ||
/// | ||
/// \headerfile <immintrin.h> | ||
/// | ||
/// \code | ||
/// void _tile_mmultf32ps(constexpr int srcdst, constexpr int a, \ | ||
/// constexpr int b); | ||
/// \endcode | ||
/// | ||
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction. | ||
/// | ||
/// \param srcdst | ||
/// The destination tile. Max size is 1024 Bytes. | ||
/// \param a | ||
/// The 1st source tile. Max size is 1024 Bytes. | ||
/// \param b | ||
/// The 2nd source tile. Max size is 1024 Bytes. | ||
/// | ||
/// \code{.operation} | ||
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) { | ||
/// dword[12:0] := 0 | ||
/// dword[31:13] := x[31:13] | ||
/// return dword | ||
/// } | ||
/// | ||
/// DEFINE silence_snan_fp32(x[31:0]) { | ||
/// IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0) | ||
/// x.fraction[22] := 1 | ||
/// return x | ||
/// } | ||
/// | ||
/// elements_a := a.colsb / 4 | ||
/// elements_dest := srcdst.colsb / 4 | ||
/// | ||
/// FOR m = 0 TO (srcdst.rows-1) | ||
/// tmp[511:0] := 0 | ||
/// FOR k = 0 TO (elements_a-1) | ||
/// FOR n = 0 TO (elements_dest-1) | ||
/// af := silence_snan_fp32(a.row[m].fp32[k]) | ||
/// bf := silence_snan_fp32(b.row[k].fp32[n]) | ||
/// tmp.fp32[n] += zero_lower_mantissa_bits_fp32(af) | ||
/// * zero_lower_mantissa_bits_fp32(bf) | ||
/// ENDFOR | ||
/// ENDFOR | ||
/// | ||
/// FOR n = 0 TO (elements_dest-1) | ||
/// tmp.fp32[n] += srcdst.row[m].fp32[n] | ||
/// ENDFOR | ||
/// write_row_and_zero(srcdst, m, tmp, srcdst.colsb) | ||
/// | ||
/// ENDFOR | ||
/// | ||
/// zero_upper_rows(srcdst, srcdst.rows) | ||
/// zero_tileconfig_start() | ||
/// \endcode | ||
#define _tile_mmultf32ps(srcdst, a, b) \ | ||
__builtin_ia32_tmmultf32ps((srcdst), (a), (b)) | ||
|
||
// Forwards the shape operands m, n, k and the three tile values unchanged to | ||
// __builtin_ia32_tmmultf32ps_internal and returns the tile it produces. | ||
// __tile_mmultf32ps calls this with m = src0.row, n = src1.col, k = src0.col. | ||
// NOTE(review): presumably dst is m x n, src1 is m x k and src2 is k x n, | ||
// mirroring the operation pseudo-code of _tile_mmultf32ps -- confirm. | ||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32 | ||
_tile_mmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k, | ||
_tile1024i dst, _tile1024i src1, _tile1024i src2) { | ||
return __builtin_ia32_tmmultf32ps_internal(m, n, k, dst, src1, src2); | ||
} | ||
|
||
/// Do Matrix Multiplication of src0 and src1, and then do Matrix Plus with dst. | ||
/// All the calculation is based on float32 but with the lower 13 bits set to 0. | ||
/// | ||
/// \headerfile <immintrin.h> | ||
/// | ||
/// This intrinsic corresponds to the <c> TMMULTF32PS </c> instruction. | ||
/// | ||
/// \param dst | ||
/// The destination tile. Its current contents are passed in as an input and | ||
/// are then overwritten with the result. Max size is 1024 Bytes. | ||
/// \param src0 | ||
/// The 1st source tile. Max size is 1024 Bytes. | ||
/// \param src1 | ||
/// The 2nd source tile. Max size is 1024 Bytes. | ||
__DEFAULT_FN_ATTRS_TF32 | ||
static void __tile_mmultf32ps(__tile1024i *dst, __tile1024i src0, | ||
__tile1024i src1) { | ||
// Shape operands handed to the helper: m = src0.row, n = src1.col, | ||
// k = src0.col. | ||
dst->tile = _tile_mmultf32ps_internal(src0.row, src1.col, src0.col, dst->tile, | ||
src0.tile, src1.tile); | ||
} | ||
|
||
#endif // __x86_64__ | ||
#endif // __AMX_TF32INTRIN_H |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------=== | ||
* | ||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
* See https://llvm.org/LICENSE.txt for license information. | ||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
* | ||
*===------------------------------------------------------------------------=== | ||
*/ | ||
// This header is only meant to be pulled in through <immintrin.h>; reject | ||
// direct inclusion with a diagnostic that names this file. | ||
#ifndef __IMMINTRIN_H | ||
#error \ | ||
"Never use <amxtf32transposeintrin.h> directly; include <immintrin.h> instead." | ||
#endif // __IMMINTRIN_H | ||
|
||
#ifndef __AMX_TF32TRANSPOSEINTRIN_H | ||
#define __AMX_TF32TRANSPOSEINTRIN_H | ||
#ifdef __x86_64__ | ||
|
||
// Attributes applied to the inline functions defined in this header: force | ||
// inlining, omit debug info, and enable both the "amx-tf32" and | ||
// "amx-transpose" target features. | ||
#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \ | ||
__attribute__((__always_inline__, __nodebug__, \ | ||
__target__("amx-tf32,amx-transpose"))) | ||
|
||
/// Multiply the transpose of the matrix in tile \a a by the matrix in tile | ||
/// \a b, then add the product to \a srcdst and store the sum back into | ||
/// \a srcdst. All calculations are based on float32, with the lower 13 bits | ||
/// of each source element set to 0 before it is multiplied. | ||
/// | ||
/// \headerfile <immintrin.h> | ||
/// | ||
/// \code | ||
/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \ | ||
/// constexpr int b); | ||
/// \endcode | ||
/// | ||
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction. | ||
/// | ||
/// \param srcdst | ||
/// The destination tile. Max size is 1024 Bytes. | ||
/// \param a | ||
/// The 1st source tile. Max size is 1024 Bytes. | ||
/// \param b | ||
/// The 2nd source tile. Max size is 1024 Bytes. | ||
/// | ||
/// \code{.operation} | ||
/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) { | ||
/// dword[12:0] := 0 | ||
/// dword[31:13] := x[31:13] | ||
/// return dword | ||
/// } | ||
/// | ||
/// DEFINE silence_snan_fp32(x[31:0]) { | ||
/// IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0) | ||
/// x.fraction[22] := 1 | ||
/// return x | ||
/// } | ||
/// | ||
/// elements_dest:= srcdst.colsb/4 | ||
/// | ||
/// FOR m := 0 TO (srcdst.rows-1) | ||
/// tmp[511:0] := 0 | ||
/// FOR k := 0 TO (a.rows-1) | ||
/// FOR n := 0 TO (elements_dest-1) | ||
/// a1e := silence_snan_fp32(a.row[k].fp32[m]) | ||
/// a2e := silence_snan_fp32(b.row[k].fp32[n]) | ||
/// s1e := zero_lower_mantissa_bits_fp32(a1e) | ||
/// s2e := zero_lower_mantissa_bits_fp32(a2e) | ||
/// tmp.fp32[n] += s1e * s2e | ||
/// ENDFOR | ||
/// ENDFOR | ||
/// | ||
/// FOR n := 0 TO (elements_dest-1) | ||
/// tmp.fp32[n] += srcdst.row[m].fp32[n] | ||
/// ENDFOR | ||
/// write_row_and_zero(srcdst, m, tmp, srcdst.colsb) | ||
/// | ||
/// ENDFOR | ||
/// | ||
/// zero_upper_rows(srcdst, srcdst.rows) | ||
/// zero_tileconfig_start() | ||
/// \endcode | ||
#define _tile_tmmultf32ps(srcdst, a, b) \ | ||
__builtin_ia32_ttmmultf32ps((srcdst), (a), (b)) | ||
|
||
// Forwards the shape operands m, n, k and the three tile values unchanged to | ||
// __builtin_ia32_ttmmultf32ps_internal and returns the tile it produces. | ||
// dst = m x n (srcdst), src1 = k x m, src2 = k x n | ||
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE | ||
_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k, | ||
_tile1024i dst, _tile1024i src1, _tile1024i src2) { | ||
return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2); | ||
} | ||
|
||
/// Compute transpose and do Matrix Multiplication of src0 and src1, and then do | ||
/// Matrix Plus with dst. All the calculation is based on float32 but with the | ||
/// lower 13 bits set to 0. | ||
/// | ||
/// \headerfile <immintrin.h> | ||
/// | ||
/// This intrinsic corresponds to the <c> TTMMULTF32PS </c> instruction. | ||
/// | ||
/// \param dst | ||
/// The destination tile. Its current contents are passed in as an input and | ||
/// are then overwritten with the result. Max size is 1024 Bytes. | ||
/// \param src0 | ||
/// The 1st source tile. Max size is 1024 Bytes. | ||
/// \param src1 | ||
/// The 2nd source tile. Max size is 1024 Bytes. | ||
__DEFAULT_FN_ATTRS_TF32_TRANSPOSE | ||
static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0, | ||
__tile1024i src1) { | ||
// Shape operands handed to the helper: m = src0.row, n = src1.col, | ||
// k = src0.col. | ||
// NOTE(review): this is the same operand order the non-transposed | ||
// __tile_mmultf32ps uses, while the comment on _tile_tmmultf32ps_internal | ||
// describes src1 as k x m -- confirm that m and k are not swapped here. | ||
dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col, | ||
dst->tile, src0.tile, src1.tile); | ||
} | ||
|
||
#endif // __x86_64__ | ||
#endif // __AMX_TF32TRANSPOSEINTRIN_H |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -660,6 +660,15 @@ _storebe_i64(void * __P, long long __D) { | |
#include <amxavx512intrin.h> | ||
#endif | ||
|
||
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__) | ||
#include <amxtf32intrin.h> | ||
#endif | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added. |
||
#if !defined(__SCE__) || __has_feature(modules) || \ | ||
(defined(__AMX_TF32__) && defined(__AMX_TRANSPOSE__)) | ||
#include <amxtf32transposeintrin.h> | ||
#endif | ||
|
||
#if !defined(__SCE__) || __has_feature(modules) || \ | ||
defined(__AVX512VP2INTERSECT__) | ||
#include <avx512vp2intersectintrin.h> | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-tf32 \ | ||
// RUN: -target-feature +amx-transpose -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s | ||
|
||
#include <immintrin.h> | ||
#include <stddef.h> | ||
|
||
// Verifies that _tile_mmultf32ps with the immediates 1, 2, 3 is emitted as a | ||
// call of @llvm.x86.tmmultf32ps carrying the same three values as i8 operands. | ||
void test_tile_mmultf32ps(void) { | ||
// CHECK-LABEL: @test_tile_mmultf32ps( | ||
// CHECK: call void @llvm.x86.tmmultf32ps(i8 1, i8 2, i8 3) | ||
_tile_mmultf32ps(1, 2, 3); | ||
} | ||
|
||
// Verifies that _tile_tmmultf32ps with the immediates 1, 2, 3 is emitted as a | ||
// call of @llvm.x86.ttmmultf32ps carrying the same three values as i8 operands. | ||
void test_tile_tmmultf32ps(void) { | ||
// CHECK-LABEL: @test_tile_tmmultf32ps( | ||
// CHECK: call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3) | ||
_tile_tmmultf32ps(1, 2, 3); | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown \ | ||
// RUN: -target-feature +amx-tf32 -target-feature +amx-transpose \ | ||
// RUN: -target-feature +amx-bf16 -target-feature +avx512f \ | ||
// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s | ||
|
||
#include <immintrin.h> | ||
|
||
char buf[1024]; | ||
#define STRIDE 32 | ||
|
||
char buf2[1024]; | ||
|
||
// Verifies that __tile_mmultf32ps(&c, a, b) emits, in any order, a | ||
// vector-to-tile cast, a call of @llvm.x86.tmmultf32ps.internal, and a | ||
// tile-to-vector cast. | ||
void test_tile_mmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) { | ||
//CHECK-LABEL: @test_tile_mmultf32ps | ||
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) | ||
//CHECK-DAG: call x86_amx @llvm.x86.tmmultf32ps.internal | ||
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) | ||
__tile_mmultf32ps(&c, a, b); | ||
} | ||
|
||
// Verifies that __tile_tmmultf32ps(&c, a, b) emits, in any order, a | ||
// vector-to-tile cast, a call of @llvm.x86.ttmmultf32ps.internal, and a | ||
// tile-to-vector cast. | ||
void test_tile_tmmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) { | ||
//CHECK-LABEL: @test_tile_tmmultf32ps | ||
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) | ||
//CHECK-DAG: call x86_amx @llvm.x86.ttmmultf32ps.internal | ||
//CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) | ||
__tile_tmmultf32ps(&c, a, b); | ||
} |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing the new file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry. Forgot to add it. Done. Thanks.