From 60fac68bef81335aa12e2faa7e364bb647a51872 Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 14:21:51 +0200 Subject: [PATCH 01/14] [Clang][x86]: allow PCLMULQDQ intrinsics to be used in constexpr --- clang/include/clang/Basic/BuiltinsX86.td | 6 +- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 72 ++++++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 63 +++++++++++++++++ clang/test/CodeGen/X86/pclmul-builtins.c | 18 ++++- clang/test/CodeGen/X86/vpclmulqdq-builtins.c | 13 ++++ 5 files changed, 168 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index f6069fdc5707a..1eee50a441e31 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -444,15 +444,15 @@ let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; } -let Features = "pclmul", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "pclmul", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def pclmulqdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">; } -let Features = "vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "vpclmulqdq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def pclmulqdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">; } -let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 83e40f64fd979..ef740c04c83da 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2745,6 +2745,73 @@ static bool interp__builtin_ia32_addsub(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + // PCLMULQDQ: carry-less multiplication of selected 64-bit halves + // imm8 bit 0: selects lower (0) or upper (1) 64 bits of first operand + // imm8 bit 4: selects lower (0) or upper (1) 64 bits of second operand + assert(Call->getArg(0)->getType()->isVectorType() && + Call->getArg(1)->getType()->isVectorType()); + + // Extract imm8 argument + APSInt Imm8 = popToAPSInt(S, Call->getArg(2)); + unsigned Imm8Val = static_cast(Imm8.getZExtValue()); + bool SelectUpperA = (Imm8Val & 0x01) != 0; + bool SelectUpperB = (Imm8Val & 0x10) != 0; + + const Pointer &RHS = S.Stk.pop(); + const Pointer &LHS = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + + const auto *VT = Call->getArg(0)->getType()->castAs(); + PrimType ElemT = *S.getContext().classify(VT->getElementType()); + unsigned NumElems = VT->getNumElements(); + const auto *DestVT = Call->getType()->castAs(); + PrimType DestElemT = *S.getContext().classify(DestVT->getElementType()); + bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType(); + + // Process each 128-bit lane (2 elements at a time) + for (unsigned Lane = 0; Lane < NumElems; Lane += 2) { + APSInt A0, A1, B0, B1; + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + A0 = LHS.elem(Lane + 0).toAPSInt(); + A1 = LHS.elem(Lane + 1).toAPSInt(); + B0 = RHS.elem(Lane + 0).toAPSInt(); + B1 = RHS.elem(Lane + 1).toAPSInt(); + }); + + // Select the appropriate 64-bit values based on imm8 + APSInt A = SelectUpperA ? A1 : A0; + APSInt B = SelectUpperB ? B1 : B0; + + // Perform carry-less multiplication (polynomial multiplication in GF(2^64)) + // This multiplies two 64-bit values to produce a 128-bit result + APInt AVal = A.getValue().zextOrTrunc(64); + APInt BVal = B.getValue().zextOrTrunc(64); + APInt Result(128, 0); + + // For each bit in A, if set, XOR B shifted left by that bit position + for (unsigned i = 0; i < 64; ++i) { + if (AVal[i]) { + APInt ShiftedB = BVal.zext(128) << i; + Result ^= ShiftedB; + } + } + + // Split the 128-bit result into two 64-bit halves + APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned); + APSInt ResultHigh(Result.extractBits(64, 64), DestUnsigned); + + INT_TYPE_SWITCH_NO_BOOL(DestElemT, { + Dst.elem(Lane + 0) = static_cast(ResultLow); + Dst.elem(Lane + 1) = static_cast(ResultHigh); + }); + } + + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_elementwise_triop_fp( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_refgetArg(0), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) + return false; + + APSInt Imm8; + if (!EvaluateInteger(E->getArg(2), Imm8, Info)) + return false; + + // Extract bits 0 and 4 from imm8 + unsigned Imm8Val = static_cast(Imm8.getZExtValue()); + bool SelectUpperA = (Imm8Val & 0x01) != 0; + bool SelectUpperB = (Imm8Val & 0x10) != 0; + + unsigned NumElems = SourceLHS.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(NumElems); + QualType DestEltTy = E->getType()->castAs()->getElementType(); + bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType(); + + // Process each 128-bit lane + for (unsigned Lane = 0; Lane < NumElems; Lane += 2) { + // Get the two 64-bit halves of the first operand + APSInt A0 = SourceLHS.getVectorElt(Lane + 0).getInt(); + APSInt A1 = SourceLHS.getVectorElt(Lane + 1).getInt(); + // Get the two 64-bit halves of the second operand + APSInt B0 = SourceRHS.getVectorElt(Lane + 0).getInt(); + APSInt B1 = SourceRHS.getVectorElt(Lane + 1).getInt(); + + // Select the appropriate 64-bit values based on imm8 + APSInt A = SelectUpperA ? A1 : A0; + APSInt B = SelectUpperB ? B1 : B0; + + // Perform carry-less multiplication (polynomial multiplication in GF(2^64)) + // This multiplies two 64-bit values to produce a 128-bit result + APInt AVal = A.getValue().zextOrTrunc(64); + APInt BVal = B.getValue().zextOrTrunc(64); + APInt Result(128, 0); + + // For each bit in A, if set, XOR B shifted left by that bit position + for (unsigned i = 0; i < 64; ++i) { + if (AVal[i]) { + APInt ShiftedB = BVal.zext(128) << i; + Result ^= ShiftedB; + } + } + + // Split the 128-bit result into two 64-bit halves + APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned); + APSInt ResultHigh(Result.extractBits(64, 64), DestUnsigned); + + ResultElements.push_back(APValue(ResultLow)); + ResultElements.push_back(APValue(ResultHigh)); + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } case Builtin::BI__builtin_elementwise_fshl: case Builtin::BI__builtin_elementwise_fshr: { APValue SourceHi, SourceLo, SourceShift; diff --git a/clang/test/CodeGen/X86/pclmul-builtins.c b/clang/test/CodeGen/X86/pclmul-builtins.c index 44300f645a9d0..b1e3cc5719d97 100644 --- a/clang/test/CodeGen/X86/pclmul-builtins.c +++ b/clang/test/CodeGen/X86/pclmul-builtins.c @@ -1,9 +1,25 @@ // RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - | FileCheck %s - +// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - -std=c++11 | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s #include +#include "builtin_test_helpers.h" __m128i test_mm_clmulepi64_si128(__m128i a, __m128i b) { // CHECK: @llvm.x86.pclmulqdq return _mm_clmulepi64_si128(a, b, 0); } + +// Test constexpr evaluation for _mm_clmulepi64_si128 +// imm8=0x00: lower 64 bits of both operands +// Test case: 0x1 * 0x3 = 0x3 (carry-less multiplication) +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x1ULL, 0x0ULL}, (__m128i){0x3ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL)); + +// imm8=0x01: upper 64 bits of first operand, lower 64 bits of second +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x0ULL, 0x1ULL}, (__m128i){0x3ULL, 0x0ULL}, 0x01), 0x3ULL, 0x0ULL)); + +// imm8=0x10: lower 64 bits of first operand, upper 64 bits of second +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x1ULL, 0x0ULL}, (__m128i){0x0ULL, 0x3ULL}, 0x10), 0x3ULL, 0x0ULL)); + +// imm8=0x11: upper 64 bits of both operands +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x0ULL, 0x1ULL}, (__m128i){0x0ULL, 0x3ULL}, 0x11), 0x3ULL, 0x0ULL)); diff --git a/clang/test/CodeGen/X86/vpclmulqdq-builtins.c b/clang/test/CodeGen/X86/vpclmulqdq-builtins.c index aa2b8bca91268..e408e0556e380 100644 --- a/clang/test/CodeGen/X86/vpclmulqdq-builtins.c +++ b/clang/test/CodeGen/X86/vpclmulqdq-builtins.c @@ -1,17 +1,30 @@ // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - | FileCheck %s --check-prefix AVX // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes AVX,AVX512 +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - -std=c++11 | FileCheck %s --check-prefix AVX +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - -std=c++11 | FileCheck %s --check-prefixes AVX,AVX512 +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s --check-prefix AVX +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes AVX,AVX512 #include +#include "builtin_test_helpers.h" __m256i test_mm256_clmulepi64_epi128(__m256i A, __m256i B) { // AVX: @llvm.x86.pclmulqdq.256 return _mm256_clmulepi64_epi128(A, B, 0); } +// Test constexpr evaluation for _mm256_clmulepi64_epi128 +// Each 128-bit lane is processed independently +TEST_CONSTEXPR(match_m256i(_mm256_clmulepi64_epi128((__m256i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL}, (__m256i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL)); + #ifdef __AVX512F__ __m512i test_mm512_clmulepi64_epi128(__m512i A, __m512i B) { // AVX512: @llvm.x86.pclmulqdq.512 return _mm512_clmulepi64_epi128(A, B, 0); } + +// Test constexpr evaluation for _mm512_clmulepi64_epi128 +// Each 128-bit lane is processed independently +TEST_CONSTEXPR(match_m512i(_mm512_clmulepi64_epi128((__m512i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL, 0x4ULL, 0x0ULL, 0x8ULL, 0x0ULL}, (__m512i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL, 0x7ULL, 0x0ULL, 0x9ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL, 0x1cULL, 0x0ULL, 0x48ULL, 0x0ULL)); #endif From 12b605b944716fbe9fa9ad8075fdf2672098d7af Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 14:28:48 +0200 Subject: [PATCH 02/14] chore: Format files --- clang/include/clang/Basic/BuiltinsX86.td | 9 ++++++--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++-- clang/lib/AST/ExprConstant.cpp | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 1eee50a441e31..ecc05974adecb 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -444,15 +444,18 @@ let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; } -let Features = "pclmul", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { +let Features = "pclmul", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def pclmulqdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">; } -let Features = "vpclmulqdq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { +let Features = "vpclmulqdq", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def pclmulqdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">; } -let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { +let Features = "avx512f,vpclmulqdq", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index ef740c04c83da..83a61f496a3ec 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2746,13 +2746,13 @@ static bool interp__builtin_ia32_addsub(InterpState &S, CodePtr OpPC, } static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, - const CallExpr *Call) { + const CallExpr *Call) { // PCLMULQDQ: carry-less multiplication of selected 64-bit halves // imm8 bit 0: selects lower (0) or upper (1) 64 bits of first operand // imm8 bit 4: selects lower (0) or upper (1) 64 bits of second operand assert(Call->getArg(0)->getType()->isVectorType() && Call->getArg(1)->getType()->isVectorType()); - + // Extract imm8 argument APSInt Imm8 = popToAPSInt(S, Call->getArg(2)); unsigned Imm8Val = static_cast(Imm8.getZExtValue()); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index ea4a7c320a3f2..fbd3701c784d3 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13522,8 +13522,8 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { APSInt A = SelectUpperA ? A1 : A0; APSInt B = SelectUpperB ? B1 : B0; - // Perform carry-less multiplication (polynomial multiplication in GF(2^64)) - // This multiplies two 64-bit values to produce a 128-bit result + // Perform carry-less multiplication (polynomial multiplication in + // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result APInt AVal = A.getValue().zextOrTrunc(64); APInt BVal = B.getValue().zextOrTrunc(64); APInt Result(128, 0); From 2f80098af72c5758151f3c60104b4e3ead925266 Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 15:30:51 +0200 Subject: [PATCH 03/14] refactor: PR Feedback --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 5 ++--- clang/lib/AST/ExprConstant.cpp | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 83a61f496a3ec..6a71ff9c01586 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2755,9 +2755,8 @@ static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, // Extract imm8 argument APSInt Imm8 = popToAPSInt(S, Call->getArg(2)); - unsigned Imm8Val = static_cast(Imm8.getZExtValue()); - bool SelectUpperA = (Imm8Val & 0x01) != 0; - bool SelectUpperB = (Imm8Val & 0x10) != 0; + bool SelectUpperA = (Imm8.getZExtValue() & 0x01) != 0; + bool SelectUpperB = (Imm8.getZExtValue() & 0x10) != 0; const Pointer &RHS = S.Stk.pop(); const Pointer &LHS = S.Stk.pop(); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index fbd3701c784d3..ed309bdd3e377 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13499,9 +13499,8 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return false; // Extract bits 0 and 4 from imm8 - unsigned Imm8Val = static_cast(Imm8.getZExtValue()); - bool SelectUpperA = (Imm8Val & 0x01) != 0; - bool SelectUpperB = (Imm8Val & 0x10) != 0; + bool SelectUpperA = (Imm8.getZExtValue() & 0x01) != 0; + bool SelectUpperB = (Imm8.getZExtValue() & 0x10) != 0; unsigned NumElems = SourceLHS.getVectorLength(); SmallVector ResultElements; From e3de2515f40786672c2a260a2ea290903f499a0f Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 15:46:02 +0200 Subject: [PATCH 04/14] refactor: simplify conversion from APSInt to APInt --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++-- clang/lib/AST/ExprConstant.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 6a71ff9c01586..be0b560814ebd 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2785,8 +2785,8 @@ static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, // Perform carry-less multiplication (polynomial multiplication in GF(2^64)) // This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = A.getValue().zextOrTrunc(64); - APInt BVal = B.getValue().zextOrTrunc(64); + APInt AVal = A.extOrTrunc(64); + APInt BVal = B.extOrTrunc(64); APInt Result(128, 0); // For each bit in A, if set, XOR B shifted left by that bit position diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index ed309bdd3e377..22f042d515ac5 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13523,8 +13523,8 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { // Perform carry-less multiplication (polynomial multiplication in // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = A.getValue().zextOrTrunc(64); - APInt BVal = B.getValue().zextOrTrunc(64); + APInt AVal = A.extOrTrunc(64); + APInt BVal = B.extOrTrunc(64); APInt Result(128, 0); // For each bit in A, if set, XOR B shifted left by that bit position From 5afa1b171ccd84eaa4935e429160696086b158ae Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 15:47:31 +0200 Subject: [PATCH 05/14] refactor: Use static casting --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++-- clang/lib/AST/ExprConstant.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index be0b560814ebd..c692e32cdefc8 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2785,8 +2785,8 @@ static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, // Perform carry-less multiplication (polynomial multiplication in GF(2^64)) // This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = A.extOrTrunc(64); - APInt BVal = B.extOrTrunc(64); + APInt AVal = static_cast(A).zextOrTrunc(64); + APInt BVal = static_cast(B).zextOrTrunc(64); APInt Result(128, 0); // For each bit in A, if set, XOR B shifted left by that bit position diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 22f042d515ac5..03ee822b57143 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13523,8 +13523,8 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { // Perform carry-less multiplication (polynomial multiplication in // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = A.extOrTrunc(64); - APInt BVal = B.extOrTrunc(64); + APInt AVal = static_cast(A).zextOrTrunc(64); + APInt BVal = static_cast(B).zextOrTrunc(64); APInt Result(128, 0); // For each bit in A, if set, XOR B shifted left by that bit position From f5c5f23d16a9e4412e55ba8766e9d02f4184d5fa Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 16:24:02 +0200 Subject: [PATCH 06/14] refactor: update tests --- clang/test/CodeGen/X86/pclmul-builtins.c | 8 ++++---- clang/test/CodeGen/X86/vpclmulqdq-builtins.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/test/CodeGen/X86/pclmul-builtins.c b/clang/test/CodeGen/X86/pclmul-builtins.c index b1e3cc5719d97..5af4014b0f663 100644 --- a/clang/test/CodeGen/X86/pclmul-builtins.c +++ b/clang/test/CodeGen/X86/pclmul-builtins.c @@ -13,13 +13,13 @@ __m128i test_mm_clmulepi64_si128(__m128i a, __m128i b) { // Test constexpr evaluation for _mm_clmulepi64_si128 // imm8=0x00: lower 64 bits of both operands // Test case: 0x1 * 0x3 = 0x3 (carry-less multiplication) -TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x1ULL, 0x0ULL}, (__m128i){0x3ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL)); +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x1ULL, 0x0ULL}), ((__m128i){0x3ULL, 0x0ULL}), 0x00), 0x3ULL, 0x0ULL)); // imm8=0x01: upper 64 bits of first operand, lower 64 bits of second -TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x0ULL, 0x1ULL}, (__m128i){0x3ULL, 0x0ULL}, 0x01), 0x3ULL, 0x0ULL)); +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x0ULL, 0x1ULL}), ((__m128i){0x3ULL, 0x0ULL}), 0x01), 0x3ULL, 0x0ULL)); // imm8=0x10: lower 64 bits of first operand, upper 64 bits of second -TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x1ULL, 0x0ULL}, (__m128i){0x0ULL, 0x3ULL}, 0x10), 0x3ULL, 0x0ULL)); +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x1ULL, 0x0ULL}), ((__m128i){0x0ULL, 0x3ULL}), 0x10), 0x3ULL, 0x0ULL)); // imm8=0x11: upper 64 bits of both operands -TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128((__m128i){0x0ULL, 0x1ULL}, (__m128i){0x0ULL, 0x3ULL}, 0x11), 0x3ULL, 0x0ULL)); +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x0ULL, 0x1ULL}), ((__m128i){0x0ULL, 0x3ULL}), 0x11), 0x3ULL, 0x0ULL)); diff --git a/clang/test/CodeGen/X86/vpclmulqdq-builtins.c b/clang/test/CodeGen/X86/vpclmulqdq-builtins.c index e408e0556e380..24b5594518009 100644 --- a/clang/test/CodeGen/X86/vpclmulqdq-builtins.c +++ b/clang/test/CodeGen/X86/vpclmulqdq-builtins.c @@ -15,7 +15,7 @@ __m256i test_mm256_clmulepi64_epi128(__m256i A, __m256i B) { // Test constexpr evaluation for _mm256_clmulepi64_epi128 // Each 128-bit lane is processed independently -TEST_CONSTEXPR(match_m256i(_mm256_clmulepi64_epi128((__m256i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL}, (__m256i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL)); +TEST_CONSTEXPR(match_m256i(_mm256_clmulepi64_epi128(((__m256i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL}), ((__m256i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL}), 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL)); #ifdef __AVX512F__ __m512i test_mm512_clmulepi64_epi128(__m512i A, __m512i B) { @@ -25,6 +25,6 @@ __m512i test_mm512_clmulepi64_epi128(__m512i A, __m512i B) { // Test constexpr evaluation for _mm512_clmulepi64_epi128 // Each 128-bit lane is processed independently -TEST_CONSTEXPR(match_m512i(_mm512_clmulepi64_epi128((__m512i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL, 0x4ULL, 0x0ULL, 0x8ULL, 0x0ULL}, (__m512i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL, 0x7ULL, 0x0ULL, 0x9ULL, 0x0ULL}, 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL, 0x1cULL, 0x0ULL, 0x48ULL, 0x0ULL)); +TEST_CONSTEXPR(match_m512i(_mm512_clmulepi64_epi128(((__m512i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL, 0x4ULL, 0x0ULL, 0x8ULL, 0x0ULL}), ((__m512i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL, 0x7ULL, 0x0ULL, 0x9ULL, 0x0ULL}), 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL, 0x1cULL, 0x0ULL, 0x48ULL, 0x0ULL)); #endif From 0cee75e569754d6813d5dc20281109a8c2292b17 Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 16:30:50 +0200 Subject: [PATCH 07/14] Update InterpBuiltin.cpp --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index c692e32cdefc8..106b67565ba51 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2755,8 +2755,8 @@ static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, // Extract imm8 argument APSInt Imm8 = popToAPSInt(S, Call->getArg(2)); - bool SelectUpperA = (Imm8.getZExtValue() & 0x01) != 0; - bool SelectUpperB = (Imm8.getZExtValue() & 0x10) != 0; + bool SelectUpperA = (Imm8 & 0x01) != 0; + bool SelectUpperB = (Imm8 & 0x10) != 0; const Pointer &RHS = S.Stk.pop(); const Pointer &LHS = S.Stk.pop(); From 974ddac3a7c98f37d78875dd9c55450d9a434581 Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 16:32:30 +0200 Subject: [PATCH 08/14] Update ExprConstant.cpp --- clang/lib/AST/ExprConstant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 03ee822b57143..e34c8b6f119d4 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13499,8 +13499,8 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return false; // Extract bits 0 and 4 from imm8 - bool SelectUpperA = (Imm8.getZExtValue() & 0x01) != 0; - bool SelectUpperB = (Imm8.getZExtValue() & 0x10) != 0; + bool SelectUpperA = (Imm8 & 0x01) != 0; + bool SelectUpperB = (Imm8 & 0x10) != 0; unsigned NumElems = SourceLHS.getVectorLength(); SmallVector ResultElements; From 7a2832377d285dc68675f3f9f177c4c69981b562 Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 16:40:34 +0200 Subject: [PATCH 09/14] refactor: Use APInt instead of APSInt --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 8 ++++---- clang/lib/AST/ExprConstant.cpp | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 106b67565ba51..710bbb5267079 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2780,13 +2780,13 @@ static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, }); // Select the appropriate 64-bit values based on imm8 - APSInt A = SelectUpperA ? A1 : A0; - APSInt B = SelectUpperB ? B1 : B0; + APInt A = SelectUpperA ? A1 : A0; + APInt B = SelectUpperB ? B1 : B0; // Perform carry-less multiplication (polynomial multiplication in GF(2^64)) // This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = static_cast(A).zextOrTrunc(64); - APInt BVal = static_cast(B).zextOrTrunc(64); + APInt AVal = A.zextOrTrunc(64);; + APInt BVal = B.zextOrTrunc(64); APInt Result(128, 0); // For each bit in A, if set, XOR B shifted left by that bit position diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index e34c8b6f119d4..b237af5d357aa 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13518,13 +13518,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { APSInt B1 = SourceRHS.getVectorElt(Lane + 1).getInt(); // Select the appropriate 64-bit values based on imm8 - APSInt A = SelectUpperA ? A1 : A0; - APSInt B = SelectUpperB ? B1 : B0; + APInt A = SelectUpperA ? A1 : A0; + APInt B = SelectUpperB ? B1 : B0; // Perform carry-less multiplication (polynomial multiplication in - // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = static_cast(A).zextOrTrunc(64); - APInt BVal = static_cast(B).zextOrTrunc(64); + // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result + APInt AVal = A.zextOrTrunc(64);; + APInt BVal = B.zextOrTrunc(64); APInt Result(128, 0); // For each bit in A, if set, XOR B shifted left by that bit position From 5c7eb8e9f8661e5f123badf01487b038473db24d Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 16:42:04 +0200 Subject: [PATCH 10/14] feat: update formatting --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 +- clang/lib/AST/ExprConstant.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 710bbb5267079..9032709b7ac6f 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2785,7 +2785,7 @@ static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, // Perform carry-less multiplication (polynomial multiplication in GF(2^64)) // This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = A.zextOrTrunc(64);; + APInt AVal = A.zextOrTrunc(64); APInt BVal = B.zextOrTrunc(64); APInt Result(128, 0); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index b237af5d357aa..7bbebfead8320 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13522,8 +13522,9 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { APInt B = SelectUpperB ? B1 : B0; // Perform carry-less multiplication (polynomial multiplication in - // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = A.zextOrTrunc(64);; + // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result + APInt AVal = A.zextOrTrunc(64); + ; APInt BVal = B.zextOrTrunc(64); APInt Result(128, 0); From 3a72489d1a6881c3dad96c4f09a4b1cfcd042b73 Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 20:21:48 +0200 Subject: [PATCH 11/14] feat: Use APIntOps::clmul --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 16 +++++++--------- clang/lib/AST/ExprConstant.cpp | 17 +++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 9032709b7ac6f..cdfaa5bf89aa5 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2787,15 +2787,13 @@ static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, // This multiplies two 64-bit values to produce a 128-bit result APInt AVal = A.zextOrTrunc(64); APInt BVal = B.zextOrTrunc(64); - APInt Result(128, 0); - - // For each bit in A, if set, XOR B shifted left by that bit position - for (unsigned i = 0; i < 64; ++i) { - if (AVal[i]) { - APInt ShiftedB = BVal.zext(128) << i; - Result ^= ShiftedB; - } - } + + // Extend both operands to 128 bits for carry-less multiplication + APInt A128 = AVal.zext(128); + APInt B128 = BVal.zext(128); + + // Use APIntOps::clmul for carry-less multiplication + APInt Result = llvm::APIntOps::clmul(A128, B128); // Split the 128-bit result into two 64-bit halves APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 7bbebfead8320..dcf189163e347 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13524,17 +13524,14 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { // Perform carry-less multiplication (polynomial multiplication in // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result APInt AVal = A.zextOrTrunc(64); - ; APInt BVal = B.zextOrTrunc(64); - APInt Result(128, 0); - - // For each bit in A, if set, XOR B shifted left by that bit position - for (unsigned i = 0; i < 64; ++i) { - if (AVal[i]) { - APInt ShiftedB = BVal.zext(128) << i; - Result ^= ShiftedB; - } - } + + // Extend both operands to 128 bits for carry-less multiplication + APInt A128 = AVal.zext(128); + APInt B128 = BVal.zext(128); + + // Use APIntOps::clmul for carry-less multiplication + APInt Result = llvm::APIntOps::clmul(A128, B128); // Split the 128-bit result into two 64-bit halves APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned); From 5c486a92c486275b1b57936cb3a911bd155ced66 Mon Sep 17 00:00:00 2001 From: ahmed Date: Sun, 23 Nov 2025 20:22:08 +0200 Subject: [PATCH 12/14] chore: format files --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++-- clang/lib/AST/ExprConstant.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index cdfaa5bf89aa5..ccf471263ae60 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2787,11 +2787,11 @@ static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, // This multiplies two 64-bit values to produce a 128-bit result APInt AVal = A.zextOrTrunc(64); APInt BVal = B.zextOrTrunc(64); - + // Extend both operands to 128 bits for carry-less multiplication APInt A128 = AVal.zext(128); APInt B128 = BVal.zext(128); - + // Use APIntOps::clmul for carry-less multiplication APInt Result = llvm::APIntOps::clmul(A128, B128); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index dcf189163e347..6d4c195f4a0c9 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13525,11 +13525,11 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result APInt AVal = A.zextOrTrunc(64); APInt BVal = B.zextOrTrunc(64); - + // Extend both operands to 128 bits for carry-less multiplication APInt A128 = AVal.zext(128); APInt B128 = BVal.zext(128); - + // Use APIntOps::clmul for carry-less multiplication APInt Result = llvm::APIntOps::clmul(A128, B128); From ec1331c7557a466b80b69bea6c776482b8eccc96 Mon Sep 17 00:00:00 2001 From: ahmed Date: Mon, 24 Nov 2025 13:36:01 +0200 Subject: [PATCH 13/14] refactor: remove unused conversion --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 9 ++------- clang/lib/AST/ExprConstant.cpp | 9 ++------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index ccf471263ae60..096ef37d5bb01 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2783,14 +2783,9 @@ static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC, APInt A = SelectUpperA ? A1 : A0; APInt B = SelectUpperB ? B1 : B0; - // Perform carry-less multiplication (polynomial multiplication in GF(2^64)) - // This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = A.zextOrTrunc(64); - APInt BVal = B.zextOrTrunc(64); - // Extend both operands to 128 bits for carry-less multiplication - APInt A128 = AVal.zext(128); - APInt B128 = BVal.zext(128); + APInt A128 = A.zext(128); + APInt B128 = B.zext(128); // Use APIntOps::clmul for carry-less multiplication APInt Result = llvm::APIntOps::clmul(A128, B128); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 6d4c195f4a0c9..c0d7832f480d8 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13521,14 +13521,9 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { APInt A = SelectUpperA ? A1 : A0; APInt B = SelectUpperB ? B1 : B0; - // Perform carry-less multiplication (polynomial multiplication in - // GF(2^64)) This multiplies two 64-bit values to produce a 128-bit result - APInt AVal = A.zextOrTrunc(64); - APInt BVal = B.zextOrTrunc(64); - // Extend both operands to 128 bits for carry-less multiplication - APInt A128 = AVal.zext(128); - APInt B128 = BVal.zext(128); + APInt A128 = A.zext(128); + APInt B128 = B.zext(128); // Use APIntOps::clmul for carry-less multiplication APInt Result = llvm::APIntOps::clmul(A128, B128); From 931bcc5320a53870df7b30bddeff132eea77a5ef Mon Sep 17 00:00:00 2001 From: ahmed Date: Mon, 24 Nov 2025 13:40:51 +0200 Subject: [PATCH 14/14] feat: add upper bits testing --- clang/test/CodeGen/X86/pclmul-builtins.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/clang/test/CodeGen/X86/pclmul-builtins.c b/clang/test/CodeGen/X86/pclmul-builtins.c index 5af4014b0f663..ee8e05e4cf2e5 100644 --- a/clang/test/CodeGen/X86/pclmul-builtins.c +++ b/clang/test/CodeGen/X86/pclmul-builtins.c @@ -23,3 +23,20 @@ TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x1ULL, 0x0ULL}), ((_ // imm8=0x11: upper 64 bits of both operands TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x0ULL, 0x1ULL}), ((__m128i){0x0ULL, 0x3ULL}), 0x11), 0x3ULL, 0x0ULL)); + +// Test cases with non-zero upper 64-bit results +// imm8=0x00: lower 64 bits of both operands +// 0x8000000000000000 * 0x2 = result with upper bits set +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){(long long)0x8000000000000000ULL, 0x0ULL}), ((__m128i){0x2ULL, 0x0ULL}), 0x00), 0x0ULL, 0x1ULL)); + +// imm8=0x01: upper 64 bits of first operand, lower 64 bits of second +// 0xFFFFFFFFFFFFFFFF * 0x2 = result with upper bits set +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x0ULL, (long long)0xFFFFFFFFFFFFFFFFULL}), ((__m128i){0x2ULL, 0x0ULL}), 0x01), 0xFFFFFFFFFFFFFFFEULL, 0x1ULL)); + +// imm8=0x10: lower 64 bits of first operand, upper 64 bits of second +// 0x1000000000000000 * 0x10 = result with upper bits set +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){(long long)0x1000000000000000ULL, 0x0ULL}), ((__m128i){0x0ULL, 0x10ULL}), 0x10), 0x0ULL, 0x1ULL)); + +// imm8=0x11: upper 64 bits of both operands +// 0x8000000000000001 * 0x8000000000000001 = result with upper bits set +TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x0ULL, (long long)0x8000000000000001ULL}), ((__m128i){0x0ULL, (long long)0x8000000000000001ULL}), 0x11), 0x1ULL, 0x4000000000000000ULL));