Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,14 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {

let Features = "ssse3" in {
def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}

let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
}
}

Expand Down Expand Up @@ -610,7 +610,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
Expand Down Expand Up @@ -649,6 +648,8 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;

def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;

def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
Expand Down Expand Up @@ -1347,14 +1348,15 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>

let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}

let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;

def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}

let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
Expand Down
33 changes: 33 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2790,6 +2790,34 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
return true;
}

static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC,
const CallExpr *Call) {
assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
const Pointer &Control = S.Stk.pop<Pointer>();
const Pointer &Src = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

unsigned NumElems = Dst.getNumElems();
assert(NumElems == Control.getNumElems());
assert(NumElems == Dst.getNumElems());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are asserting the assignment right before the asserts? Was that what you meant to do?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was a mistake, it should compare Dst with Src and Contol.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need these? The ia32 builtin defs are pretty strict

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really the asserts can be removed completely.


for (unsigned Idx = 0; Idx != NumElems; ++Idx) {
uint8_t Ctlb = static_cast<uint8_t>(Control.elem<int8_t>(Idx));

if (Ctlb & 0x80) {
Dst.elem<int8_t>(Idx) = 0;
} else {
unsigned LaneBase = (Idx / 16) * 16;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this just a right shift and a left shift? That would seem more readable code, intent wise.

Also 16 is a magic number, can we name it.

Same applies in the code below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I can change that ,so LaneBase will then be (Idx >> 4) <<4, then there is no need for naming 16 , and i can also name 0x80 and 0x0F MSBMask and LowNibbleMask for clarity.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if MSBMask/LowNibbleMask obfuscation is a great idea - keeping reasonably close to the psuedocode in the Intel Intrinsics Guide is probably a better guideline.

unsigned SrcOffset = Ctlb & 0x0F;
unsigned SrcIdx = LaneBase + SrcOffset;

Dst.elem<int8_t>(Idx) = Src.elem<int8_t>(SrcIdx);
}
}
Dst.initializeAllElements();
return true;
}

static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC,
const CallExpr *Call, bool IsShufHW) {
assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
Expand Down Expand Up @@ -3943,6 +3971,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_selectpd_512:
return interp__builtin_select(S, OpPC, Call);

case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512:
return interp__builtin_ia32_pshufb(S, OpPC, Call);

case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512:
Expand Down
47 changes: 47 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11619,6 +11619,44 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
return true;
}

static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is really a shame we are duplicating code so much. If we find a bug in this implementation, will folks really remember to fix it in both places? @RKSimon @tbaederr

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially explored evalPshufBuiltin, but since the function handles the control as a scalar and applies different shuffle logic, integrating the pshufb seemed to require heavy conditional branching which i thought might complicate readability. However, this is my first LLVM contribution and i completely understand if i am missing some broader context. I am happy to rework this or refactor both implementations together if that would be a good long term solution.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have been looking at options for using a callback mechanism for shuffle mask decoding - similar to what we do for binops - I've suggested this to @chaitanyav on #164078 and we can build on that.

APValue &Out) {
APValue SrcVec, ControlVec;
if (!EvaluateAsRValue(Info, Call->getArg(0), SrcVec))
return false;
if (!EvaluateAsRValue(Info, Call->getArg(1), ControlVec))
return false;

const auto *VT = Call->getType()->getAs<VectorType>();
if (!VT)
return false;

QualType ElemT = VT->getElementType();
unsigned NumElts = VT->getNumElements();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have assertions in interp__builtin_ia32_pshufb we should have them here too, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The definition in BuiltnsX86.td will enforce them, so we can remove them from the InterpBuiltin too.


SmallVector<APValue, 64> ResultElements;
ResultElements.reserve(NumElts);

for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
APValue CtlVal = ControlVec.getVectorElt(Idx);
APSInt CtlByte = CtlVal.getInt();
uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue());

if (Ctl & 0x80) {
APValue Zero(Info.Ctx.MakeIntValue(0, ElemT));
ResultElements.push_back(Zero);
} else {
unsigned LaneBase = (Idx / 16) * 16;
unsigned SrcOffset = Ctl & 0x0F;
unsigned SrcIdx = LaneBase + SrcOffset;

ResultElements.push_back(SrcVec.getVectorElt(SrcIdx));
}
}
Out = APValue(ResultElements.data(), ResultElements.size());
return true;
}

static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call,
bool IsShufHW, APValue &Out) {
APValue Vec;
Expand Down Expand Up @@ -12241,6 +12279,15 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}

case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512: {
APValue R;
if (!evalPshufbBuiltin(Info, E, R))
return false;
return Success(R, E);
}

case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512: {
Expand Down
5 changes: 2 additions & 3 deletions clang/lib/Headers/avx2intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1852,9 +1852,8 @@ _mm256_sad_epu8(__m256i __a, __m256i __b)
/// control byte specify the index (within the same 128-bit half) of \a __a
/// to copy to the result byte.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shuffle_epi8(__m256i __a, __m256i __b)
{
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_shuffle_epi8(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
}

Expand Down
15 changes: 6 additions & 9 deletions clang/lib/Headers/avx512bwintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -866,23 +866,20 @@ _mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
(__v32hi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_shuffle_epi8(__m512i __A, __m512i __B)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
(__v64qi)_mm512_shuffle_epi8(__A, __B),
(__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
(__v64qi)_mm512_shuffle_epi8(__A, __B),
(__v64qi)_mm512_setzero_si512());
Expand Down
20 changes: 8 additions & 12 deletions clang/lib/Headers/avx512vlbwintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1067,33 +1067,29 @@ _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
(__v16hi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
(__v16qi)_mm_shuffle_epi8(__A, __B),
(__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
(__v16qi)_mm_shuffle_epi8(__A, __B),
(__v16qi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
(__v32qi)_mm256_shuffle_epi8(__A, __B),
(__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
(__v32qi)_mm256_shuffle_epi8(__A, __B),
(__v32qi)_mm256_setzero_si256());
Expand Down
20 changes: 9 additions & 11 deletions clang/lib/Headers/tmmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -590,10 +590,9 @@ _mm_mulhrs_pi16(__m64 __a, __m64 __b)
/// Bits [6:4] Reserved. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 128-bit integer vector containing the copied or cleared values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shuffle_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_shuffle_epi8(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
}

/// Copies the 8-bit integers from a 64-bit integer vector to the
Expand All @@ -615,13 +614,12 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
/// destination. \n
/// Bits [2:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_shuffle_pi8(__m64 __a, __m64 __b)
{
return __trunc64(__builtin_ia32_pshufb128(
(__v16qi)__builtin_shufflevector(
(__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
(__v16qi)__anyext128(__b)));
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_shuffle_pi8(__m64 __a, __m64 __b) {
return __trunc64(__builtin_ia32_pshufb128(
(__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){},
0, 1, 0, 1),
(__v16qi)__zext128(__b)));
}

/// For each 8-bit integer in the first source operand, perform one of
Expand Down
2 changes: 2 additions & 0 deletions clang/test/CodeGen/X86/avx2-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1130,6 +1130,8 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
return _mm256_shuffle_epi8(a, b);
}

TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qs){0,33,2,35,4,37,6,-39,8,41,10,43,12,45,14,-47,16,49,18,51,20,53,22,-55,24,57,26,59,28,61,30,-63}), 0,1,2,3,4,5,6,0,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,0,24,25,26,27,28,29,30,0));

__m256i test_mm256_shuffle_epi32(__m256i a) {
// CHECK-LABEL: test_mm256_shuffle_epi32
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
Expand Down
9 changes: 9 additions & 0 deletions clang/test/CodeGen/X86/avx512bw-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1466,18 +1466,27 @@ __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.pshuf.b.512
return _mm512_shuffle_epi8(__A,__B);
}

TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,-79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,-95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,0,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,0));

__m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_shuffle_epi8
// CHECK: @llvm.x86.avx512.pshuf.b.512
// CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_mask_shuffle_epi8(__W,__U,__A,__B);
}

TEST_CONSTEXPR(match_v64qi(_mm512_mask_shuffle_epi8((__m512i)(__v64qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8}, 0xFFFFFFFF00000000, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48));

__m512i test_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_shuffle_epi8
// CHECK: @llvm.x86.avx512.pshuf.b.512
// CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_maskz_shuffle_epi8(__U,__A,__B);
}

TEST_CONSTEXPR(match_v64qi(_mm512_maskz_shuffle_epi8(0x8888888888888888,(__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){127,126,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64}), 0,0,0,12,0,0,0,8,0,0,0,4,0,0,0,0,0,0,0,28,0,0,0,24,0,0,0,20,0,0,0,16,0,0,0,44,0,0,0,40,0,0,0,36,0,0,0,32,0,0,0,60,0,0,0,56,0,0,0,52,0,0,0,48));

__m512i test_mm512_subs_epi8(__m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_subs_epi8
// CHECK: @llvm.ssub.sat.v64i8
Expand Down
13 changes: 13 additions & 0 deletions clang/test/CodeGen/X86/avx512vlbw-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1688,24 +1688,37 @@ __m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m12
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_mask_shuffle_epi8(__W,__U,__A,__B);
}

TEST_CONSTEXPR(match_v16qi(_mm_mask_shuffle_epi8((__m128i)(__v16qi){1,1,1,1,1,1,1,1,2,2,4,4,6,6,8,8}, 0x00FF, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,2,2,4,4,6,6,8,8));

__m128i test_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_maskz_shuffle_epi8
// CHECK: @llvm.x86.ssse3.pshuf.b
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_shuffle_epi8(__U,__A,__B);
}

TEST_CONSTEXPR(match_v16qi(_mm_maskz_shuffle_epi8(0xAAAA, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 0,14,0,12,0,10,0,8,0,6,0,4,0,2,0,0));

__m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_mask_shuffle_epi8
// CHECK: @llvm.x86.avx2.pshuf.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_mask_shuffle_epi8(__W,__U,__A,__B);
}

TEST_CONSTEXPR(match_v32qi(_mm256_mask_shuffle_epi8((__m256i)(__v32qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4}, 0x80808080, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,8,2,2,2,2,2,2,2,0,3,3,3,3,3,3,3,24,4,4,4,4,4,4,4,16));


__m256i test_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_maskz_shuffle_epi8
// CHECK: @llvm.x86.avx2.pshuf.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_shuffle_epi8(__U,__A,__B);
}

TEST_CONSTEXPR(match_v32qi(_mm256_maskz_shuffle_epi8(0x0000FFFF, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0));

__m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_subs_epi8
// CHECK: @llvm.ssub.sat.v16i8
Expand Down
2 changes: 2 additions & 0 deletions clang/test/CodeGen/X86/mmx-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,8 @@ __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) {
return _mm_shuffle_pi8(a, b);
}

TEST_CONSTEXPR(match_v8qi(_mm_shuffle_pi8((__m64)(__v8qi){0,1,2,3,4,5,6,7}, (__m64)(__v8qi){10,20,30,40,50,60,70,80}), 2,4,6,0,2,4,6,0));

__m64 test_mm_shuffle_pi16(__m64 a) {
// CHECK-LABEL: test_mm_shuffle_pi16
// CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
Expand Down
2 changes: 2 additions & 0 deletions clang/test/CodeGen/X86/ssse3-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) {
return _mm_shuffle_epi8(a, b);
}

TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qs){0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15}, (__m128i)(__v16qs){15,-14,13,-12,11,-10,9,-8,7,-6,5,-4,3,-2,1,0}), -15,0,-13,0,-11,0,-9,0,-7,0,-5,0,-3,0,-1,0));

__m128i test_mm_sign_epi8(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_sign_epi8
// CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
Expand Down
Loading