Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -444,15 +444,18 @@ let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth
def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}

let Features = "pclmul", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
let Features = "pclmul",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pclmulqdq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Constant char)">;
}

let Features = "vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
let Features = "vpclmulqdq",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pclmulqdq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant char)">;
}

let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
let Features = "avx512f,vpclmulqdq",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">;
}

Expand Down
64 changes: 64 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2745,6 +2745,65 @@ static bool interp__builtin_ia32_addsub(InterpState &S, CodePtr OpPC,
return true;
}

static bool interp__builtin_ia32_pclmulqdq(InterpState &S, CodePtr OpPC,
const CallExpr *Call) {
// PCLMULQDQ: carry-less multiplication of selected 64-bit halves
// imm8 bit 0: selects lower (0) or upper (1) 64 bits of first operand
// imm8 bit 4: selects lower (0) or upper (1) 64 bits of second operand
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());

// Extract imm8 argument
APSInt Imm8 = popToAPSInt(S, Call->getArg(2));
bool SelectUpperA = (Imm8 & 0x01) != 0;
bool SelectUpperB = (Imm8 & 0x10) != 0;

const Pointer &RHS = S.Stk.pop<Pointer>();
const Pointer &LHS = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
PrimType ElemT = *S.getContext().classify(VT->getElementType());
unsigned NumElems = VT->getNumElements();
const auto *DestVT = Call->getType()->castAs<VectorType>();
PrimType DestElemT = *S.getContext().classify(DestVT->getElementType());
bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();

// Process each 128-bit lane (2 elements at a time)
for (unsigned Lane = 0; Lane < NumElems; Lane += 2) {
APSInt A0, A1, B0, B1;
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
A0 = LHS.elem<T>(Lane + 0).toAPSInt();
A1 = LHS.elem<T>(Lane + 1).toAPSInt();
B0 = RHS.elem<T>(Lane + 0).toAPSInt();
B1 = RHS.elem<T>(Lane + 1).toAPSInt();
});

// Select the appropriate 64-bit values based on imm8
APInt A = SelectUpperA ? A1 : A0;
APInt B = SelectUpperB ? B1 : B0;

// Extend both operands to 128 bits for carry-less multiplication
APInt A128 = A.zext(128);
APInt B128 = B.zext(128);

// Use APIntOps::clmul for carry-less multiplication
APInt Result = llvm::APIntOps::clmul(A128, B128);

// Split the 128-bit result into two 64-bit halves
APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned);
APSInt ResultHigh(Result.extractBits(64, 64), DestUnsigned);

INT_TYPE_SWITCH_NO_BOOL(DestElemT, {
Dst.elem<T>(Lane + 0) = static_cast<T>(ResultLow);
Dst.elem<T>(Lane + 1) = static_cast<T>(ResultHigh);
});
}

Dst.initializeAllElements();
return true;
}

static bool interp__builtin_elementwise_triop_fp(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APFloat(const APFloat &, const APFloat &,
Expand Down Expand Up @@ -4366,6 +4425,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return llvm::APIntOps::muluExtended(LoLHS, LoRHS);
});

case clang::X86::BI__builtin_ia32_pclmulqdq128:
case clang::X86::BI__builtin_ia32_pclmulqdq256:
case clang::X86::BI__builtin_ia32_pclmulqdq512:
return interp__builtin_ia32_pclmulqdq(S, OpPC, Call);

case Builtin::BI__builtin_elementwise_fma:
return interp__builtin_elementwise_triop_fp(
S, OpPC, Call,
Expand Down
55 changes: 55 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13483,6 +13483,61 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
case clang::X86::BI__builtin_ia32_pclmulqdq128:
case clang::X86::BI__builtin_ia32_pclmulqdq256:
case clang::X86::BI__builtin_ia32_pclmulqdq512: {
// PCLMULQDQ: carry-less multiplication of selected 64-bit halves
// imm8 bit 0: selects lower (0) or upper (1) 64 bits of first operand
// imm8 bit 4: selects lower (0) or upper (1) 64 bits of second operand
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;

APSInt Imm8;
if (!EvaluateInteger(E->getArg(2), Imm8, Info))
return false;

// Extract bits 0 and 4 from imm8
bool SelectUpperA = (Imm8 & 0x01) != 0;
bool SelectUpperB = (Imm8 & 0x10) != 0;

unsigned NumElems = SourceLHS.getVectorLength();
SmallVector<APValue, 8> ResultElements;
ResultElements.reserve(NumElems);
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();

// Process each 128-bit lane
for (unsigned Lane = 0; Lane < NumElems; Lane += 2) {
// Get the two 64-bit halves of the first operand
APSInt A0 = SourceLHS.getVectorElt(Lane + 0).getInt();
APSInt A1 = SourceLHS.getVectorElt(Lane + 1).getInt();
// Get the two 64-bit halves of the second operand
APSInt B0 = SourceRHS.getVectorElt(Lane + 0).getInt();
APSInt B1 = SourceRHS.getVectorElt(Lane + 1).getInt();

// Select the appropriate 64-bit values based on imm8
APInt A = SelectUpperA ? A1 : A0;
APInt B = SelectUpperB ? B1 : B0;

// Extend both operands to 128 bits for carry-less multiplication
APInt A128 = A.zext(128);
APInt B128 = B.zext(128);

// Use APIntOps::clmul for carry-less multiplication
APInt Result = llvm::APIntOps::clmul(A128, B128);

// Split the 128-bit result into two 64-bit halves
APSInt ResultLow(Result.extractBits(64, 0), DestUnsigned);
APSInt ResultHigh(Result.extractBits(64, 64), DestUnsigned);

ResultElements.push_back(APValue(ResultLow));
ResultElements.push_back(APValue(ResultHigh));
}

return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
case Builtin::BI__builtin_elementwise_fshl:
case Builtin::BI__builtin_elementwise_fshr: {
APValue SourceHi, SourceLo, SourceShift;
Expand Down
35 changes: 34 additions & 1 deletion clang/test/CodeGen/X86/pclmul-builtins.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,42 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - | FileCheck %s

// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - -std=c++11 | FileCheck %s
// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +pclmul -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s

#include <wmmintrin.h>
#include "builtin_test_helpers.h"

__m128i test_mm_clmulepi64_si128(__m128i a, __m128i b) {
// CHECK: @llvm.x86.pclmulqdq
return _mm_clmulepi64_si128(a, b, 0);
}

// Test constexpr evaluation for _mm_clmulepi64_si128
// imm8=0x00: lower 64 bits of both operands
// Test case: 0x1 * 0x3 = 0x3 (carry-less multiplication)
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x1ULL, 0x0ULL}), ((__m128i){0x3ULL, 0x0ULL}), 0x00), 0x3ULL, 0x0ULL));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


// imm8=0x01: upper 64 bits of first operand, lower 64 bits of second
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x0ULL, 0x1ULL}), ((__m128i){0x3ULL, 0x0ULL}), 0x01), 0x3ULL, 0x0ULL));

// imm8=0x10: lower 64 bits of first operand, upper 64 bits of second
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x1ULL, 0x0ULL}), ((__m128i){0x0ULL, 0x3ULL}), 0x10), 0x3ULL, 0x0ULL));

// imm8=0x11: upper 64 bits of both operands
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x0ULL, 0x1ULL}), ((__m128i){0x0ULL, 0x3ULL}), 0x11), 0x3ULL, 0x0ULL));

// Test cases with non-zero upper 64-bit results
// imm8=0x00: lower 64 bits of both operands
// 0x8000000000000000 * 0x2 = result with upper bits set
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){(long long)0x8000000000000000ULL, 0x0ULL}), ((__m128i){0x2ULL, 0x0ULL}), 0x00), 0x0ULL, 0x1ULL));

// imm8=0x01: upper 64 bits of first operand, lower 64 bits of second
// 0xFFFFFFFFFFFFFFFF * 0x2 = result with upper bits set
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x0ULL, (long long)0xFFFFFFFFFFFFFFFFULL}), ((__m128i){0x2ULL, 0x0ULL}), 0x01), 0xFFFFFFFFFFFFFFFEULL, 0x1ULL));

// imm8=0x10: lower 64 bits of first operand, upper 64 bits of second
// 0x1000000000000000 * 0x10 = result with upper bits set
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){(long long)0x1000000000000000ULL, 0x0ULL}), ((__m128i){0x0ULL, 0x10ULL}), 0x10), 0x0ULL, 0x1ULL));

// imm8=0x11: upper 64 bits of both operands
// 0x8000000000000001 * 0x8000000000000001 = result with upper bits set
TEST_CONSTEXPR(match_m128i(_mm_clmulepi64_si128(((__m128i){0x0ULL, (long long)0x8000000000000001ULL}), ((__m128i){0x0ULL, (long long)0x8000000000000001ULL}), 0x11), 0x1ULL, 0x4000000000000000ULL));
13 changes: 13 additions & 0 deletions clang/test/CodeGen/X86/vpclmulqdq-builtins.c
Original file line number Diff line number Diff line change
@@ -1,17 +1,30 @@
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - | FileCheck %s --check-prefix AVX
// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - | FileCheck %s --check-prefixes AVX,AVX512
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - -std=c++11 | FileCheck %s --check-prefix AVX
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - -std=c++11 | FileCheck %s --check-prefixes AVX,AVX512
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s --check-prefix AVX
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +vpclmulqdq -target-feature +avx512f -emit-llvm -o - -std=c++11 -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes AVX,AVX512

#include <immintrin.h>
#include "builtin_test_helpers.h"

__m256i test_mm256_clmulepi64_epi128(__m256i A, __m256i B) {
// AVX: @llvm.x86.pclmulqdq.256
return _mm256_clmulepi64_epi128(A, B, 0);
}

// Test constexpr evaluation for _mm256_clmulepi64_epi128
// Each 128-bit lane is processed independently
TEST_CONSTEXPR(match_m256i(_mm256_clmulepi64_epi128(((__m256i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL}), ((__m256i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL}), 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer to see some complex values - not just some simple cases - we need to be certain that the implementation is complete - have you done any fuzz testing comparing constexpr vs runtime ?


#ifdef __AVX512F__
__m512i test_mm512_clmulepi64_epi128(__m512i A, __m512i B) {
// AVX512: @llvm.x86.pclmulqdq.512
return _mm512_clmulepi64_epi128(A, B, 0);
}

// Test constexpr evaluation for _mm512_clmulepi64_epi128
// Each 128-bit lane is processed independently
TEST_CONSTEXPR(match_m512i(_mm512_clmulepi64_epi128(((__m512i){0x1ULL, 0x0ULL, 0x2ULL, 0x0ULL, 0x4ULL, 0x0ULL, 0x8ULL, 0x0ULL}), ((__m512i){0x3ULL, 0x0ULL, 0x5ULL, 0x0ULL, 0x7ULL, 0x0ULL, 0x9ULL, 0x0ULL}), 0x00), 0x3ULL, 0x0ULL, 0xaULL, 0x0ULL, 0x1cULL, 0x0ULL, 0x48ULL, 0x0ULL));
#endif