-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[X86][Clang] Add AVX512 kunpck intrinsics to be used in constexp #167683
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Add AVX-512 mask unpack intrinsics _mm512_kunpackd and _mm512_kunpackw to avx512fintrin.h alongside the existing _mm512_kunpackb intrinsic. These intrinsics extract and concatenate the lower halves of mask registers, using the existing backend support for __builtin_ia32_kunpckdi and __builtin_ia32_kunpcksi builtins. Also adds __mmask32 and __mmask64 type definitions to avx512fintrin.h for completeness. Tests already exist in clang/test/CodeGen/X86/avx512bw-builtins.c.
…expr support Add AVX-512 mask unpack intrinsics _mm512_kunpackd and _mm512_kunpackw to avx512fintrin.h alongside the existing _mm512_kunpackb intrinsic. These intrinsics extract and concatenate the lower halves of mask registers, using the existing backend support for __builtin_ia32_kunpckdi and __builtin_ia32_kunpcksi builtins. Also adds __mmask32 and __mmask64 type definitions to avx512fintrin.h for completeness. This patch adds constexpr support for all three kunpack intrinsics by: 1. Using __DEFAULT_FN_ATTRS_CONSTEXPR attribute 2. Adding builtin interpretation in ExprConstant.cpp for compile-time evaluation in constexpr contexts 3. Adding constexpr tests to verify correct behavior Tests already exist in clang/test/CodeGen/X86/avx512bw-builtins.c for runtime code generation validation.
|
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-clang Author: Ahmed Nour (ahmednoursphinx) ChangesResolves #166976 Full diff: https://github.com/llvm/llvm-project/pull/167683.diff 4 Files Affected:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 1bfea24b228e8..a0a1d0ce2a94b 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -16287,6 +16287,42 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
return Success((A | B) == 0, E);
}
+ case clang::X86::BI__builtin_ia32_kunpckhi: {
+ APSInt A, B;
+ if (!EvaluateInteger(E->getArg(0), A, Info) ||
+ !EvaluateInteger(E->getArg(1), B, Info))
+ return false;
+
+ // Extract lower 8 bits of each operand and concatenate
+ // Result = (A[7:0] << 8) | B[7:0]
+ APSInt Result = ((A & 0xFF) << 8) | (B & 0xFF);
+ return Success(Result, E);
+ }
+
+ case clang::X86::BI__builtin_ia32_kunpckdi: {
+ APSInt A, B;
+ if (!EvaluateInteger(E->getArg(0), A, Info) ||
+ !EvaluateInteger(E->getArg(1), B, Info))
+ return false;
+
+ // Extract lower 32 bits of each operand and concatenate
+ // Result = (A[31:0] << 32) | B[31:0]
+ APSInt Result = ((A & 0xFFFFFFFFULL) << 32) | (B & 0xFFFFFFFFULL);
+ return Success(Result, E);
+ }
+
+ case clang::X86::BI__builtin_ia32_kunpcksi: {
+ APSInt A, B;
+ if (!EvaluateInteger(E->getArg(0), A, Info) ||
+ !EvaluateInteger(E->getArg(1), B, Info))
+ return false;
+
+ // Extract lower 16 bits of each operand and concatenate
+ // Result = (A[15:0] << 16) | B[15:0]
+ APSInt Result = ((A & 0xFFFF) << 16) | (B & 0xFFFF);
+ return Success(Result, E);
+ }
+
case clang::X86::BI__builtin_ia32_lzcnt_u16:
case clang::X86::BI__builtin_ia32_lzcnt_u32:
case clang::X86::BI__builtin_ia32_lzcnt_u64: {
@@ -16413,6 +16449,21 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
return Success(APValue(Result), E);
}
+ case X86::BI__builtin_ia32_kunpckhi:
+ case X86::BI__builtin_ia32_kunpcksi:
+ case X86::BI__builtin_ia32_kunpckdi: {
+ return HandleMaskBinOp([](const APSInt &LHS, const APSInt &RHS) {
+ // Unpack: concatenate lower half of RHS with lower half of LHS
+ unsigned HalfBits = LHS.getBitWidth() / 2;
+ APSInt Mask = APSInt::getMaxValue(LHS.getBitWidth(), LHS.isUnsigned());
+ Mask = Mask.trunc(HalfBits).zext(LHS.getBitWidth());
+
+ APSInt LowerLHS = LHS & Mask;
+ APSInt LowerRHS = RHS & Mask;
+ return LowerRHS | (LowerLHS << HalfBits);
+ });
+ }
+
case X86::BI__builtin_ia32_kaddqi:
case X86::BI__builtin_ia32_kaddhi:
case X86::BI__builtin_ia32_kaddsi:
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index 4a02c96620335..d247f648b9eb5 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -1606,15 +1606,14 @@ _mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
(__v64qi) _mm512_setzero_si512());
}
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd(__mmask64 __A,
- __mmask64 __B) {
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
(__mmask64) __B);
}
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
-{
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
(__mmask32) __B);
}
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 997e9608e112f..badc30a7eb26c 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -40,6 +40,8 @@ typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)))
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
+typedef unsigned int __mmask32;
+typedef unsigned long long __mmask64;
/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
@@ -8094,12 +8096,21 @@ _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
}
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
-{
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
}
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
+ return (__mmask64)__builtin_ia32_kunpckdi((__mmask64)__A, (__mmask64)__B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
+ return (__mmask32)__builtin_ia32_kunpcksi((__mmask32)__A, (__mmask32)__B);
+}
+
static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm512_kxnor(__mmask16 __A, __mmask16 __B) {
return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 17778b52d3671..9dcc749910175 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -9126,6 +9126,24 @@ __mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D
__E, __F);
}
+TEST_CONSTEXPR(_mm512_kunpackb(0xFF00, 0x00FF) == 0xFF00);
+TEST_CONSTEXPR(_mm512_kunpackb(0xABCD, 0x1234) == 0xCD34);
+TEST_CONSTEXPR(_mm512_kunpackb(0x00FF, 0xFF00) == 0x0000);
+TEST_CONSTEXPR(_mm512_kunpackb(0xAAAA, 0x5555) == 0xAA55);
+TEST_CONSTEXPR(_mm512_kunpackb(0x1234, 0xABCD) == 0x34CD);
+
+TEST_CONSTEXPR(_mm512_kunpackw(0xFFFF0000u, 0x0000FFFFu) == 0x0000FFFFu);
+TEST_CONSTEXPR(_mm512_kunpackw(0xABCD1234u, 0x56789ABCu) == 0x12349ABCu);
+TEST_CONSTEXPR(_mm512_kunpackw(0x0000FFFFu, 0xFFFF0000u) == 0x00000000u);
+TEST_CONSTEXPR(_mm512_kunpackw(0xAAAA5555u, 0x5555AAAAu) == 0x5555AAAAu);
+TEST_CONSTEXPR(_mm512_kunpackw(0x12345678u, 0xABCDEF12u) == 0x5678EF12u);
+
+TEST_CONSTEXPR(_mm512_kunpackd(0xFFFFFFFF00000000ull, 0x00000000FFFFFFFFull) == 0x00000000FFFFFFFFull);
+TEST_CONSTEXPR(_mm512_kunpackd(0xABCDEF0123456789ull, 0x0123456789ABCDEFull) == 0x234567899ABCDEFull);
+TEST_CONSTEXPR(_mm512_kunpackd(0x00000000FFFFFFFFull, 0xFFFFFFFF00000000ull) == 0x0000000000000000ull);
+TEST_CONSTEXPR(_mm512_kunpackd(0xAAAA5555AAAA5555ull, 0x5555AAAA5555AAAAull) == 0xAAAA55555555AAAAull);
+TEST_CONSTEXPR(_mm512_kunpackd(0x123456789ABCDEFull, 0xFEDCBA9876543210ull) == 0x89ABCDEF76543210ull);
+
__mmask16 test_mm512_kxnor(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
// CHECK-LABEL: test_mm512_kxnor
// CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
|
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still missing InterpBuiltin and BuiltinsX86.td fixes
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/174/builds/27581 Here is the relevant piece of the build log for the reference |
Resolves #166976