Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 7 additions & 25 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -402,39 +402,21 @@ let Features = "avx512f,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth
def aesdeclast512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
}

let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
let Features = "gfni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vgf2p8affineinvqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
}

let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def vgf2p8affineinvqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
}

let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vgf2p8affineinvqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
}

let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vgf2p8affineqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
}

let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def vgf2p8affineqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
}

let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vgf2p8affineqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
}

let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vgf2p8mulb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
}

let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
let Features = "avx,gfni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def vgf2p8affineinvqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
def vgf2p8affineqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
def vgf2p8mulb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
}

let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
let Features = "avx512f,gfni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def vgf2p8affineinvqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
def vgf2p8affineqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}

Expand Down
109 changes: 109 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3735,6 +3735,100 @@ static bool interp__builtin_ia32_multishiftqb(InterpState &S, CodePtr OpPC,
return true;
}

/// Constant-evaluates the GFNI affine builtins (vgf2p8affineqb and
/// vgf2p8affineinvqb) in the bytecode interpreter.
///
/// The call takes (X, A, Imm): two byte vectors of equal length and an
/// integer immediate. The vectors are processed in groups of 8 bytes
/// (qwords). For each group, the 8 bytes of A are packed into a 64-bit value
/// and every byte of X in that group is transformed with GFNIAffine().
///
/// \param Inverse forwarded to GFNIAffine(); true for the "inv" builtins.
/// \returns false if the argument types are not (vector, vector, integer),
///          true once the destination vector on top of the stack is written.
static bool interp_builtin_ia32_gfni_affine(InterpState &S, CodePtr OpPC,
                                            const CallExpr *Call,
                                            bool Inverse) {
  assert(Call->getNumArgs() == 3);
  QualType XType = Call->getArg(0)->getType();
  QualType AType = Call->getArg(1)->getType();
  QualType ImmType = Call->getArg(2)->getType();
  if (!XType->isVectorType() || !AType->isVectorType() ||
      !ImmType->isIntegerType())
    return false;

  // Arguments are popped in reverse order: Imm, then A, then X.
  APSInt Imm = popToAPSInt(S, Call->getArg(2));
  Pointer A = S.Stk.pop<Pointer>();
  Pointer X = S.Stk.pop<Pointer>();

  const Pointer &Dst = S.Stk.peek<Pointer>();
  const auto *AVecT = AType->castAs<VectorType>();
  // The cast is done inside the assert so that no local is left unused when
  // asserts are compiled out.
  assert(XType->castAs<VectorType>()->getNumElements() ==
         AVecT->getNumElements());

  const unsigned NumBitsInByte = 8;
  const unsigned NumBytesInQWord = 8;
  const unsigned NumBitsInQWord = 64;
  unsigned NumBytes = AVecT->getNumElements();
  unsigned NumQWords = NumBytes / NumBytesInQWord;
  PrimType AElemT = *S.getContext().classify(AVecT->getElementType());

  // Computing A*X + Imm, one qword (8 bytes) at a time.
  for (unsigned QWordIdx = 0; QWordIdx != NumQWords; ++QWordIdx) {
    unsigned FirstByte = QWordIdx * NumBytesInQWord;

    // Pack the 8 bytes of A for this qword; byte 0 lands in the low bits.
    APInt AQWord(NumBitsInQWord, 0);
    for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) {
      uint8_t AByte = 0;
      INT_TYPE_SWITCH(AElemT, {
        AByte = static_cast<uint8_t>(A.elem<T>(FirstByte + ByteIdx));
      });
      AQWord.insertBits(APInt(NumBitsInByte, AByte), ByteIdx * NumBitsInByte);
    }

    // Transform each byte of X in this qword with the packed matrix. The X
    // bytes are read directly; packing them into a qword first and then
    // extracting them again would yield the same values.
    for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) {
      unsigned Idx = FirstByte + ByteIdx;
      INT_TYPE_SWITCH(AElemT, {
        uint8_t XByte = static_cast<uint8_t>(X.elem<T>(Idx));
        Dst.elem<T>(Idx) = T::from(GFNIAffine(XByte, AQWord, Imm, Inverse));
      });
    }
  }
  Dst.initializeAllElements();
  return true;
}

/// Constant-evaluates the GFNI multiply builtins (vgf2p8mulb) in the bytecode
/// interpreter.
///
/// The call takes two byte vectors of equal length; each destination byte is
/// GFNIMul() of the corresponding bytes of the two operands.
///
/// \returns false if either argument is not a vector, true once the
///          destination vector on top of the stack is written.
static bool interp__builtin_ia32_gfni_mul(InterpState &S, CodePtr OpPC,
                                          const CallExpr *Call) {
  assert(Call->getNumArgs() == 2);

  QualType AType = Call->getArg(0)->getType();
  QualType BType = Call->getArg(1)->getType();
  if (!AType->isVectorType() || !BType->isVectorType())
    return false;

  // Arguments are popped in reverse order: B first, then A.
  Pointer B = S.Stk.pop<Pointer>();
  Pointer A = S.Stk.pop<Pointer>();

  const Pointer &Dst = S.Stk.peek<Pointer>();
  const auto *AVecT = AType->castAs<VectorType>();
  // The cast is done inside the assert so that no local is left unused when
  // asserts are compiled out.
  assert(AVecT->getNumElements() ==
         BType->castAs<VectorType>()->getNumElements());

  PrimType AElemT = *S.getContext().classify(AVecT->getElementType());
  unsigned NumBytes = A.getNumElems();

  for (unsigned ByteIdx = 0; ByteIdx != NumBytes; ++ByteIdx) {
    INT_TYPE_SWITCH(AElemT, {
      uint8_t AByte = static_cast<uint8_t>(A.elem<T>(ByteIdx));
      uint8_t BByte = static_cast<uint8_t>(B.elem<T>(ByteIdx));
      Dst.elem<T>(ByteIdx) = T::from(GFNIMul(AByte, BByte));
    });
  }

  Dst.initializeAllElements();
  return true;
}

bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
uint32_t BuiltinID) {
if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
Expand Down Expand Up @@ -4749,6 +4843,21 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return std::pair<unsigned, int>{SrcIdx,
static_cast<int>(LaneOffset + Index)};
});

case X86::BI__builtin_ia32_vgf2p8affineinvqb_v16qi:
case X86::BI__builtin_ia32_vgf2p8affineinvqb_v32qi:
case X86::BI__builtin_ia32_vgf2p8affineinvqb_v64qi:
return interp_builtin_ia32_gfni_affine(S, OpPC, Call, true);
case X86::BI__builtin_ia32_vgf2p8affineqb_v16qi:
case X86::BI__builtin_ia32_vgf2p8affineqb_v32qi:
case X86::BI__builtin_ia32_vgf2p8affineqb_v64qi:
return interp_builtin_ia32_gfni_affine(S, OpPC, Call, false);

case X86::BI__builtin_ia32_vgf2p8mulb_v16qi:
case X86::BI__builtin_ia32_vgf2p8mulb_v32qi:
case X86::BI__builtin_ia32_vgf2p8mulb_v64qi:
return interp__builtin_ia32_gfni_mul(S, OpPC, Call);

case X86::BI__builtin_ia32_insertps128:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned Mask) {
Expand Down
8 changes: 8 additions & 0 deletions clang/lib/AST/ExprConstShared.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@
#define LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H

#include "clang/Basic/TypeTraits.h"
#include <cstdint>

namespace llvm {
class APFloat;
class APInt;
class APSInt;
}
namespace clang {
class QualType;
Expand Down Expand Up @@ -74,4 +77,9 @@ void HandleComplexComplexDiv(llvm::APFloat A, llvm::APFloat B, llvm::APFloat C,
CharUnits GetAlignOfExpr(const ASTContext &Ctx, const Expr *E,
UnaryExprOrTypeTrait ExprKind);

/// Returns the multiplicative inverse of \p Byte in GF(2^8), read from the
/// lookup table defined in ExprConstant.cpp. The table maps 0 to 0.
uint8_t GFNIMultiplicativeInverse(uint8_t Byte);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would we be better off putting these in APIntOps or MathExtras? @tbaederr - thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that needed anywhere else? In regular codegen maybe?

Copy link
Collaborator

@RKSimon RKSimon Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

x86 (middle-end/back-end) might end up with some additional GFNI folds at some point; I don't know whether any other targets have an identical instruction.

Copy link
Contributor Author

@chaitanyav chaitanyav Nov 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please let me know the appropriate place for the helper functions; I will move them to the preferred location.

/// Multiplies \p AByte and \p BByte as polynomials over GF(2) and reduces the
/// product modulo x^8 + x^4 + x^3 + x + 1 (0x11B).
uint8_t GFNIMul(uint8_t AByte, uint8_t BByte);
/// Applies the GFNI affine transformation to \p XByte. Bit i of the result is
/// the parity of (byte 7 - i of \p AQword AND \p XByte) XORed with bit i of
/// \p Imm. When \p Inverse is true, \p XByte is first replaced by
/// GFNIMultiplicativeInverse(XByte).
uint8_t GFNIAffine(uint8_t XByte, const llvm::APInt &AQword,
const llvm::APSInt Imm, bool Inverse = false);

#endif
165 changes: 165 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13712,6 +13712,89 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(R, E);
}

case X86::BI__builtin_ia32_vgf2p8affineinvqb_v16qi:
case X86::BI__builtin_ia32_vgf2p8affineinvqb_v32qi:
case X86::BI__builtin_ia32_vgf2p8affineinvqb_v64qi:
case X86::BI__builtin_ia32_vgf2p8affineqb_v16qi:
case X86::BI__builtin_ia32_vgf2p8affineqb_v32qi:
case X86::BI__builtin_ia32_vgf2p8affineqb_v64qi: {

APValue X, A;
APSInt Imm;
if (!EvaluateAsRValue(Info, E->getArg(0), X) ||
!EvaluateAsRValue(Info, E->getArg(1), A) ||
!EvaluateInteger(E->getArg(2), Imm, Info))
return false;

assert(X.isVector() && A.isVector());
assert(X.getVectorLength() == A.getVectorLength());

bool IsInverse = false;
switch (E->getBuiltinCallee()) {
case X86::BI__builtin_ia32_vgf2p8affineinvqb_v16qi:
case X86::BI__builtin_ia32_vgf2p8affineinvqb_v32qi:
case X86::BI__builtin_ia32_vgf2p8affineinvqb_v64qi: {
IsInverse = true;
}
}

unsigned NumBitsInByte = 8;
unsigned NumBytesInQWord = 8;
unsigned NumBitsInQWord = 64;
unsigned NumBytes = A.getVectorLength();
unsigned NumQWords = NumBytes / NumBytesInQWord;
SmallVector<APValue, 64> Result;
Result.reserve(NumBytes);

// computing A*X + Imm
for (unsigned QWordIdx = 0; QWordIdx != NumQWords; ++QWordIdx) {
// Extract the QWords from X, A
APInt XQWord(NumBitsInQWord, 0);
APInt AQWord(NumBitsInQWord, 0);
for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) {
unsigned Idx = QWordIdx * NumBytesInQWord + ByteIdx;
APInt XByte = X.getVectorElt(Idx).getInt();
APInt AByte = A.getVectorElt(Idx).getInt();
XQWord.insertBits(XByte, ByteIdx * NumBitsInByte);
AQWord.insertBits(AByte, ByteIdx * NumBitsInByte);
}

for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) {
uint8_t XByte =
XQWord.lshr(ByteIdx * NumBitsInByte).getLoBits(8).getZExtValue();
Result.push_back(APValue(APSInt(
APInt(8, GFNIAffine(XByte, AQWord, Imm, IsInverse)), false)));
}
}

return Success(APValue(Result.data(), Result.size()), E);
}

case X86::BI__builtin_ia32_vgf2p8mulb_v16qi:
case X86::BI__builtin_ia32_vgf2p8mulb_v32qi:
case X86::BI__builtin_ia32_vgf2p8mulb_v64qi: {
APValue A, B;
if (!EvaluateAsRValue(Info, E->getArg(0), A) ||
!EvaluateAsRValue(Info, E->getArg(1), B))
return false;

assert(A.isVector() && B.isVector());
assert(A.getVectorLength() == B.getVectorLength());

unsigned NumBytes = A.getVectorLength();
SmallVector<APValue, 64> Result;
Result.reserve(NumBytes);

for (unsigned ByteIdx = 0; ByteIdx != NumBytes; ++ByteIdx) {
uint8_t AByte = A.getVectorElt(ByteIdx).getInt().getZExtValue();
uint8_t BByte = B.getVectorElt(ByteIdx).getInt().getZExtValue();
Result.push_back(APValue(
APSInt(APInt(8, GFNIMul(AByte, BByte)), /*IsUnsigned=*/false)));
}

return Success(APValue(Result.data(), Result.size()), E);
}

case X86::BI__builtin_ia32_insertf32x4_256:
case X86::BI__builtin_ia32_inserti32x4_256:
case X86::BI__builtin_ia32_insertf64x2_256:
Expand Down Expand Up @@ -19278,6 +19361,88 @@ bool ComplexExprEvaluator::VisitCastExpr(const CastExpr *E) {
llvm_unreachable("unknown cast resulting in complex value");
}

/// Returns the multiplicative inverse of \p Byte in GF(2^8) via table lookup.
/// Zero has no inverse; the table maps it to 0.
uint8_t GFNIMultiplicativeInverse(uint8_t Byte) {
  // Lookup table for the multiplicative inverse in GF(2^8). It is declared
  // static constexpr so the 256 entries live in read-only storage instead of
  // being materialised on the stack on every call.
  static constexpr uint8_t GFInv[256] = {
      0x00, 0x01, 0x8d, 0xf6, 0xcb, 0x52, 0x7b, 0xd1, 0xe8, 0x4f, 0x29, 0xc0,
      0xb0, 0xe1, 0xe5, 0xc7, 0x74, 0xb4, 0xaa, 0x4b, 0x99, 0x2b, 0x60, 0x5f,
      0x58, 0x3f, 0xfd, 0xcc, 0xff, 0x40, 0xee, 0xb2, 0x3a, 0x6e, 0x5a, 0xf1,
      0x55, 0x4d, 0xa8, 0xc9, 0xc1, 0x0a, 0x98, 0x15, 0x30, 0x44, 0xa2, 0xc2,
      0x2c, 0x45, 0x92, 0x6c, 0xf3, 0x39, 0x66, 0x42, 0xf2, 0x35, 0x20, 0x6f,
      0x77, 0xbb, 0x59, 0x19, 0x1d, 0xfe, 0x37, 0x67, 0x2d, 0x31, 0xf5, 0x69,
      0xa7, 0x64, 0xab, 0x13, 0x54, 0x25, 0xe9, 0x09, 0xed, 0x5c, 0x05, 0xca,
      0x4c, 0x24, 0x87, 0xbf, 0x18, 0x3e, 0x22, 0xf0, 0x51, 0xec, 0x61, 0x17,
      0x16, 0x5e, 0xaf, 0xd3, 0x49, 0xa6, 0x36, 0x43, 0xf4, 0x47, 0x91, 0xdf,
      0x33, 0x93, 0x21, 0x3b, 0x79, 0xb7, 0x97, 0x85, 0x10, 0xb5, 0xba, 0x3c,
      0xb6, 0x70, 0xd0, 0x06, 0xa1, 0xfa, 0x81, 0x82, 0x83, 0x7e, 0x7f, 0x80,
      0x96, 0x73, 0xbe, 0x56, 0x9b, 0x9e, 0x95, 0xd9, 0xf7, 0x02, 0xb9, 0xa4,
      0xde, 0x6a, 0x32, 0x6d, 0xd8, 0x8a, 0x84, 0x72, 0x2a, 0x14, 0x9f, 0x88,
      0xf9, 0xdc, 0x89, 0x9a, 0xfb, 0x7c, 0x2e, 0xc3, 0x8f, 0xb8, 0x65, 0x48,
      0x26, 0xc8, 0x12, 0x4a, 0xce, 0xe7, 0xd2, 0x62, 0x0c, 0xe0, 0x1f, 0xef,
      0x11, 0x75, 0x78, 0x71, 0xa5, 0x8e, 0x76, 0x3d, 0xbd, 0xbc, 0x86, 0x57,
      0x0b, 0x28, 0x2f, 0xa3, 0xda, 0xd4, 0xe4, 0x0f, 0xa9, 0x27, 0x53, 0x04,
      0x1b, 0xfc, 0xac, 0xe6, 0x7a, 0x07, 0xae, 0x63, 0xc5, 0xdb, 0xe2, 0xea,
      0x94, 0x8b, 0xc4, 0xd5, 0x9d, 0xf8, 0x90, 0x6b, 0xb1, 0x0d, 0xd6, 0xeb,
      0xc6, 0x0e, 0xcf, 0xad, 0x08, 0x4e, 0xd7, 0xe3, 0x5d, 0x50, 0x1e, 0xb3,
      0x5b, 0x23, 0x38, 0x34, 0x68, 0x46, 0x03, 0x8c, 0xdd, 0x9c, 0x7d, 0xa0,
      0xcd, 0x1a, 0x41, 0x1c};

  return GFInv[Byte];
}

uint8_t GFNIAffine(uint8_t XByte, const APInt &AQword, const APSInt Imm,
bool Inverse) {
unsigned NumBitsInByte = 8;
// Computing the affine transformation
uint8_t RetByte = 0;
for (uint32_t BitIdx = 0; BitIdx != NumBitsInByte; ++BitIdx) {
uint8_t AByte =
AQword.lshr((7 - static_cast<int32_t>(BitIdx)) * NumBitsInByte)
.getLoBits(8)
.getZExtValue();
uint8_t Product;
if (Inverse) {
Product = AByte & GFNIMultiplicativeInverse(XByte);
} else {
Product = AByte & XByte;
}
uint8_t Parity = 0;

// Dot product in GF(2) uses XOR instead of addition
for (unsigned PBitIdx = 0; PBitIdx != NumBitsInByte; ++PBitIdx) {
Parity = Parity ^ ((Product >> PBitIdx) & 0x1);
}

uint8_t Temp = Imm[BitIdx] ? 1 : 0;
RetByte |= (Temp ^ Parity) << BitIdx;
}
return RetByte;
}

/// Multiplies two elements of GF(2^8), each encoded as a byte whose bit k is
/// the coefficient of x^k, and returns the product reduced modulo
/// x^8 + x^4 + x^3 + x + 1.
uint8_t GFNIMul(uint8_t AByte, uint8_t BByte) {
  // Reduction polynomial x^8 + x^4 + x^3 + x + 1.
  const unsigned ReductionPoly = 0x11B;

  // Carry-less multiplication: for every set bit of BByte, XOR in a copy of
  // AByte shifted by that bit's position. Both operands have degree at most
  // 7, so the raw product has degree at most 14.
  unsigned Product = 0;
  for (unsigned Shift = 0; Shift < 8; ++Shift) {
    if (BByte & (1u << Shift))
      Product ^= static_cast<unsigned>(AByte) << Shift;
  }

  // Fold the terms of degree 14 down to 8 back into the low byte by
  // cancelling each one with a suitably shifted reduction polynomial.
  for (unsigned Degree = 14; Degree >= 8; --Degree) {
    if (Product & (1u << Degree))
      Product ^= ReductionPoly << (Degree - 8);
  }

  return static_cast<uint8_t>(Product & 0xFF);
}

void HandleComplexComplexMul(APFloat A, APFloat B, APFloat C, APFloat D,
APFloat &ResR, APFloat &ResI) {
// This is an implementation of complex multiplication according to the
Expand Down
Loading