-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[Clang][x86]: allow PCLMULQDQ intrinsics to be used in constexpr #169214
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86 Author: Ahmed Nour (ahmednoursphinx) ChangesResolves #168741 Full diff: https://github.com/llvm/llvm-project/pull/169214.diff 5 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index f6069fdc5707a..1eee50a441e31 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -444,15 +444,15 @@ let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth
def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}
-let Features = "pclmul", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "pclmul", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pclmulqdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
}
-let Features = "vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "vpclmulqdq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pclmulqdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">;
}
-let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">;
}
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 83e40f64fd979..ef740c04c83da 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2745,6 +2745,73 @@ static bool interp__builtin_ia32_addsub(InterpState &S, CodePtr OpPC,
return true;
}
+static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call) {
+ // PCLMULQDQ: carry-less multiplication of selected 64-bit halves
+ // imm8 bit 0: selects lower (0) or upper (1) 64 bits of first operand
+ // imm8 bit 4: selects lower (0) or upper (1) 64 bits of second operand
+ assert(Call->getArg(0)->getType()->isVectorType() &&
+ Call->getArg(1)->getType()->isVectorType());
+
+ // Extract imm8 argument
+ APSInt Imm8 = popToAPSInt(S, Call->getArg(2));
+ unsigned Imm8Val = static_cast<unsigned>(Imm8.getZExtValue());
+ bool SelectUpperA = (Imm8Val & 0x01) != 0;
+ bool SelectUpperB = (Imm8Val & 0x10) != 0;
+
+ const Pointer &RHS = S.Stk.pop<Pointer>();
+ const Pointer &LHS = S.Stk.pop<Pointer>();
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+
+ const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+ PrimType ElemT = *S.getContext().classify(VT->getElementType());
+ unsigned NumElems = VT->getNumElements();
+ const auto *DestVT = Call->getType()->castAs<VectorType>();
+ PrimType DestElemT = *S.getContext().classify(DestVT->getElementType());
+ bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+
+ // Process each 128-bit lane (2 elements at a time)
+ for (unsigned Lane = 0; Lane < NumElems; Lane += 2) {
+ APSInt A0, A1, B0, B1;
+ INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+ A0 = LHS.elem<T>(Lane + 0).toAPSInt();
+ A1 = LHS.elem<T>(Lane + 1).toAPSInt();
+ B0 = RHS.elem<T>(Lane + 0).toAPSInt();
+ B1 = RHS.elem<T>(Lane + 1).toAPSInt();
+ });
+
+ // Select the appropriate 64-bit values based on imm8
+ APSInt A = SelectUpperA ? A1 : A0;
+ APSInt B = SelectUpperB ? B1 : B0;
+
+ // Perform carry-less multiplication (polynomial multiplication in GF(2^64))
+ // This multiplies two 64-bit values to produce a 128-bit result
+ APInt AVal = A.getValue().zextOrTrunc(64);
+ APInt BVal = B.getValue().zextOrTrunc(64);
+ APInt Result(128, 0);
+
+ // For each bit in A, if set, XOR B shifted left by that bit position
+ for (unsigned i = 0; i < 64; ++i) {
+ if (AVal[i]) {
+ APInt ShiftedB = BVal.zext(128) << i;
+ Result ^= ShiftedB;
+ }
+ }
+
+ // Split the 128-bit result into two 64-bit halves
+ APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned);
+ APSInt ResultHigh(Result.extractBits(64, 64), DestUnsigned);
+
+ INT_TYPE_SWITCH_NO_BOOL(DestElemT, {
+ Dst.elem<T>(Lane + 0) = static_cast<T>(ResultLow);
+ Dst.elem<T>(Lane + 1) = static_cast<T>(ResultHigh);
+ });
+ }
+
+ Dst.initializeAllElements();
+ return true;
+}
+
static bool interp__builtin_elementwise_triop_fp(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APFloat(const APFloat &, const APFloat &,
@@ -4366,6 +4433,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return llvm::APIntOps::muluExtended(LoLHS, LoRHS);
});
+ case clang::X86::BI__builtin_ia32_pclmulqdq128:
+ case clang::X86::BI__builtin_ia32_pclmulqdq256:
+ case clang::X86::BI__builtin_ia32_pclmulqdq512:
+ return interp__builtin_ia32_pclmulqdq(S, OpPC, Call);
+
case Builtin::BI__builtin_elementwise_fma:
return interp__builtin_elementwise_triop_fp(
S, OpPC, Call,
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 3b91678f7d400..ea4a7c320a3f2 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -13483,6 +13483,69 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case clang::X86::BI__builtin_ia32_pclmulqdq128:
+ case clang::X86::BI__builtin_ia32_pclmulqdq256:
+ case clang::X86::BI__builtin_ia32_pclmulqdq512: {
+ // PCLMULQDQ: carry-less multiplication of selected 64-bit halves
+ // imm8 bit 0: selects lower (0) or upper (1) 64 bits of first operand
+ // imm8 bit 4: selects lower (0) or upper (1) 64 bits of second operand
+ APValue SourceLHS, SourceRHS;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+ return false;
+
+ APSInt Imm8;
+ if (!EvaluateInteger(E->getArg(2), Imm8, Info))
+ return false;
+
+ // Extract bits 0 and 4 from imm8
+ unsigned Imm8Val = static_cast<unsigned>(Imm8.getZExtValue());
+ bool SelectUpperA = (Imm8Val & 0x01) != 0;
+ bool SelectUpperB = (Imm8Val & 0x10) != 0;
+
+ unsigned NumElems = SourceLHS.getVectorLength();
+ SmallVector<APValue, 8> ResultElements;
+ ResultElements.reserve(NumElems);
+ QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+ bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+
+ // Process each 128-bit lane
+ for (unsigned Lane = 0; Lane < NumElems; Lane += 2) {
+ // Get the two 64-bit halves of the first operand
+ APSInt A0 = SourceLHS.getVectorElt(Lane + 0).getInt();
+ APSInt A1 = SourceLHS.getVectorElt(Lane + 1).getInt();
+ // Get the two 64-bit halves of the second operand
+ APSInt B0 = SourceRHS.getVectorElt(Lane + 0).getInt();
+ APSInt B1 = SourceRHS.getVectorElt(Lane + 1).getInt();
+
+ // Select the appropriate 64-bit values based on imm8
+ APSInt A = SelectUpperA ? A1 : A0;
+ APSInt B = SelectUpperB ? B1 : B0;
+
+ // Perform carry-less multiplication (polynomial multiplication in GF(2^64))
+ // This multiplies two 64-bit values to produce a 128-bit result
+ APInt AVal = A.getValue().zextOrTrunc(64);
+ APInt BVal = B.getValue().zextOrTrunc(64);
+ APInt Result(128, 0);
+
+ // For each bit in A, if set, XOR B shifted left by that bit position
+ for (unsigned i = 0; i < 64; ++i) {
+ if (AVal[i]) {
+ APInt ShiftedB = BVal.zext(128) << i;
+ Result ^= ShiftedB;
+ }
+ }
+
+ // Split the 128-bit result into two 64-bit halves
+ APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned);
+ APSInt ResultHigh(Result.extractBits(64, 64), DestUnsigned);
+
+ ResultElements.push_back(APValue(ResultLow));
+ ResultElements.push_back(APValue(ResultHigh));
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
case Builtin::BI__builtin_elementwise_fshl:
case Builtin::BI__builtin_elementwise_fshr: {
APValue SourceHi, SourceLo, SourceShift;
diff --git a/clang/test/CodeGen/X86/pclmul-builtins.c b/clang/test/CodeGen/X86/pclmul-builtins.c
index 44300f645a9d0..b1e3cc5719d97 100644
--- a/clang/test/CodeGen/X86/pclmul-builtins.c
+++ b/clang/test/CodeGen/X86/pclmul-builtins.c
@@ -1,9 +1,25 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - | FileCheck %s
-
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - -std=c++11 | FileCheck %s
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s
#include <wmmintrin.h>
+#include "builtin_test_helpers.h"
__m128i test_mm_clmulepi64_si128(__m128i a, __m128i b) {
// CHECK: @llvm.x86.pclmulqdq
return _mm_clmulepi64_si128(a, b, 0);
}
+
+// Test constexpr evaluation for _mm_clmulepi64_si128
+// imm8=0x00: lower 64 bits of both operands
+// Test case: 0x1 * 0x3 = 0x3 (carry-less multiplication)
+TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x1ULL, 0x0ULL}, (__m128i){0x3ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL));
+
+// imm8=0x01: upper 64 bits of first operand, lower 64 bits of second
+TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x0ULL, 0x1ULL}, (__m128i){0x3ULL, 0x0ULL}, 0x01), 0x3ULL, 0x0ULL));
+
+// imm8=0x10: lower 64 bits of first operand, upper 64 bits of second
+TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x1ULL, 0x0ULL}, (__m128i){0x0ULL, 0x3ULL}, 0x10), 0x3ULL, 0x0ULL));
+
+// imm8=0x11: upper 64 bits of both operands
+TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x0ULL, 0x1ULL}, (__m128i){0x0ULL, 0x3ULL}, 0x11), 0x3ULL, 0x0ULL));
diff --git a/clang/test/CodeGen/X86/vpclmulqdq-builtins.c b/clang/test/CodeGen/X86/vpclmulqdq-builtins.c
index aa2b8bca91268..e408e0556e380 100644
--- a/clang/test/CodeGen/X86/vpclmulqdq-builtins.c
+++ b/clang/test/CodeGen/X86/vpclmulqdq-builtins.c
@@ -1,17 +1,30 @@
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - | FileCheck %s --check-prefix AVX
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes AVX,AVX512
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - -std=c++11 | FileCheck %s --check-prefix AVX
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - -std=c++11 | FileCheck %s --check-prefixes AVX,AVX512
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s --check-prefix AVX
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes AVX,AVX512
#include <immintrin.h>
+#include "builtin_test_helpers.h"
__m256i test_mm256_clmulepi64_epi128(__m256i A, __m256i B) {
// AVX: @llvm.x86.pclmulqdq.256
return _mm256_clmulepi64_epi128(A, B, 0);
}
+// Test constexpr evaluation for _mm256_clmulepi64_epi128
+// Each 128-bit lane is processed independently
+TEST_CONSTEXPR(match_m256i(_mm256_clmulepi64_epi128((__m256i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL}, (__m256i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL));
+
#ifdef __AVX512F__
__m512i test_mm512_clmulepi64_epi128(__m512i A, __m512i B) {
// AVX512: @llvm.x86.pclmulqdq.512
return _mm512_clmulepi64_epi128(A, B, 0);
}
+
+// Test constexpr evaluation for _mm512_clmulepi64_epi128
+// Each 128-bit lane is processed independently
+TEST_CONSTEXPR(match_m512i(_mm512_clmulepi64_epi128((__m512i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL, 0x4ULL, 0x0ULL, 0x8ULL, 0x0ULL}, (__m512i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL, 0x7ULL, 0x0ULL, 0x9ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL, 0x1cULL, 0x0ULL, 0x48ULL, 0x0ULL));
#endif
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
🐧 Linux x64 Test Results
|
| APInt AVal = A.getValue().zextOrTrunc(64); | ||
| APInt BVal = B.getValue().zextOrTrunc(64); | ||
| APInt AVal = A.extOrTrunc(64); | ||
| APInt BVal = B.extOrTrunc(64); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe make A + B both APInt and still use zextOrTrunc - I never trust APSInt signedness....
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great idea , updated it
|
Hey @RKSimon PR is ready for review again when you have time |
| // Test constexpr evaluation for _mm_clmulepi64_si128 | ||
| // imm8=0x00: lower 64 bits of both operands | ||
| // Test case: 0x1 * 0x3 = 0x3 (carry-less multiplication) | ||
| TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x1ULL, 0x0ULL}), ((__m128i){0x3ULL, 0x0ULL}), 0x00), 0x3ULL, 0x0ULL)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we need tests showing results in the upper 64-bits as well:
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=clmul&ig_expand=754
|
Hey @RKSimon PR is ready for review again when you have some time |
|
|
||
| // Test constexpr evaluation for _mm256_clmulepi64_epi128 | ||
| // Each 128-bit lane is processed independently | ||
| TEST_CONSTEXPR(match_m256i(_mm256_clmulepi64_epi128(((__m256i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL}), ((__m256i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL}), 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd prefer to see some complex values - not just some simple cases - we need to be certain that the implementation is complete - have you done any fuzz testing comparing constexpr vs runtime ?
Resolves #168741