From 8d16fb7a55101dd282678bc06f91f8fca3d246ae Mon Sep 17 00:00:00 2001 From: shashank1545 Date: Mon, 13 Oct 2025 14:06:45 +0530 Subject: [PATCH 1/4] [X86][ByteCode] Allow PSHUFB intrinsics to be used in constexpr #156612 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PSHUFB instruction shuffles bytes within each 128-bit lane: for each control byte, if bit 7 is set, the output byte is zeroed; otherwise, the low 4 bits select a source byte (0–15) from the same lane. --- clang/include/clang/Basic/BuiltinsX86.td | 8 +-- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 40 +++++++++++++++ clang/lib/AST/ExprConstant.cpp | 54 ++++++++++++++++++++ clang/lib/Headers/avx2intrin.h | 5 +- clang/lib/Headers/avx512bwintrin.h | 15 +++--- clang/lib/Headers/avx512vlbwintrin.h | 20 +++----- clang/lib/Headers/tmmintrin.h | 21 ++++---- clang/test/CodeGen/X86/avx2-builtins.c | 2 + clang/test/CodeGen/X86/avx512bw-builtins.c | 9 ++++ clang/test/CodeGen/X86/avx512vlbw-builtins.c | 13 +++++ clang/test/CodeGen/X86/mmx-builtins.c | 2 + clang/test/CodeGen/X86/ssse3-builtins.c | 2 + 12 files changed, 153 insertions(+), 38 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 217589d7add1d..d5e6e94ce6010 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -124,7 +124,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { } def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; - def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; @@ -132,6 +131,7 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">; + def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; } } @@ -588,7 +588,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; - def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; @@ -627,6 +626,8 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; + def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; + def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">; def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">; def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">; @@ -1318,7 +1319,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512> let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">; - def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; } let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { @@ -1326,6 +1326,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVect def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">; def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; + + def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; } let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 3811fb072fa16..db3bc98452f0f 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2714,6 +2714,41 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); + const Pointer &Control = S.Stk.pop(); + const Pointer &Src = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + + unsigned NumElems = Dst.getNumElems(); + PrimType ElemT = Dst.getFieldDesc()->getPrimType(); + unsigned ElemBits = static_cast(primSize(ElemT) * 8); + + assert(NumElems == 16 || NumElems == 32 || NumElems == 64); + assert(NumElems == Control.getNumElems()); + assert(NumElems == Dst.getNumElems()); + + if (ElemBits != 8) + return false; + + for (unsigned Idx = 0; Idx != NumElems; ++Idx) { + uint8_t Ctlb = static_cast(Control.elem(Idx)); + + if (Ctlb & 0x80) { + Dst.elem(Idx) = 0; + } else { + unsigned LaneBase = (Idx / 16) * 16; + unsigned SrcOffset = Ctlb & 0x0F; + unsigned SrcIdx = LaneBase + SrcOffset; + + Dst.elem(Idx) = Src.elem(SrcIdx); + } + } + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, const CallExpr *Call, bool IsShufHW) { assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); @@ -3739,6 +3774,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_selectpd_512: return interp__builtin_select(S, OpPC, Call); + case X86::BI__builtin_ia32_pshufb128: + case X86::BI__builtin_ia32_pshufb256: + case X86::BI__builtin_ia32_pshufb512: + return interp__builtin_ia32_pshufb(S, OpPC, Call); + case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 35a866ea5010f..2e5027d0397c4 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11615,6 +11615,51 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, return true; } +static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call, + APValue &Out) { + APValue SrcVec, ControlVec; + if (!EvaluateAsRValue(Info, Call->getArg(0), SrcVec)) + return false; + if (!EvaluateAsRValue(Info, Call->getArg(1), ControlVec)) + return false; + + const auto *VT = Call->getType()->getAs(); + if (!VT) + return false; + + QualType ElemT = VT->getElementType(); + unsigned ElemBits = Info.Ctx.getTypeSize(ElemT); + + if (ElemBits != 8) + return false; + unsigned NumElts = VT->getNumElements(); + if (NumElts != 16 && NumElts != 32 && NumElts != 64) + return false; + + SmallVector ResultElements; + ResultElements.reserve(NumElts); + + for (unsigned Idx = 0; Idx != NumElts; ++Idx) { + APValue CtlVal = ControlVec.getVectorElt(Idx); + APSInt CtlByte = CtlVal.getInt(); + uint8_t Ctl = static_cast(CtlByte.getZExtValue() & 0xFF); + + if (Ctl & 0x80) { + APSInt Zero(ElemBits, /*isUnsigned*/ false); + Zero = 0; + ResultElements.push_back(APValue(Zero)); + } else { + unsigned LaneBase = (Idx / 16) * 16; + unsigned SrcOffset = Ctl & 0x0F; + unsigned SrcIdx = LaneBase + SrcOffset; + + ResultElements.push_back(SrcVec.getVectorElt(SrcIdx)); + } + } + Out = APValue(ResultElements.data(), ResultElements.size()); + return true; +} + static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, bool IsShufHW, APValue &Out) { APValue Vec; @@ -12189,6 +12234,15 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_pshufb128: + case X86::BI__builtin_ia32_pshufb256: + case X86::BI__builtin_ia32_pshufb512: { + APValue R; + if (!evalPshufbBuiltin(Info, E, R)) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: { diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 4aaca2db8236a..f586c1c1d7439 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -1858,9 +1858,8 @@ _mm256_sad_epu8(__m256i __a, __m256i __b) /// control byte specify the index (within the same 128-bit half) of \a __a /// to copy to the result byte. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shuffle_epi8(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_shuffle_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); } diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 473fe94af65d8..23b2d290422b1 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -866,23 +866,20 @@ _mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_shuffle_epi8(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_shuffle_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_shuffle_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_shuffle_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 81e4cbb9615c1..639fb60f476c6 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -1067,33 +1067,29 @@ _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_shuffle_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_shuffle_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_shuffle_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_shuffle_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 3fc9f9834baa9..386c3704a2251 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -603,10 +603,9 @@ _mm_mulhrs_pi16(__m64 __a, __m64 __b) /// Bits [6:4] Reserved. \n /// Bits [3:0] select the source byte to be copied. /// \returns A 128-bit integer vector containing the copied or cleared values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_shuffle_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_shuffle_epi8(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b); } /// Copies the 8-bit integers from a 64-bit integer vector to the @@ -628,13 +627,13 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b) /// destination. \n /// Bits [2:0] select the source byte to be copied. /// \returns A 64-bit integer vector containing the copied or cleared values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_shuffle_pi8(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_pshufb128( - (__v16qi)__builtin_shufflevector( - (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1), - (__v16qi)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_shuffle_pi8(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_pshufb128( + (__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){}, + 0, 1, 0, 1), + (__v16qi)__builtin_shufflevector((__v2si)(__b), __extension__(__v2si){}, + 0, 1, 0, 1))); } /// For each 8-bit integer in the first source operand, perform one of diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 55f18f947b96f..7826de484f2c7 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1106,6 +1106,8 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) { return _mm256_shuffle_epi8(a, b); } +TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){0,33,2,35,4,37,6,39,8,41,10,43,12,45,14,47,16,49,18,51,20,53,22,55,24,57,26,59,28,61,30,63}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31)); + __m256i test_mm256_shuffle_epi32(__m256i a) { // CHECK-LABEL: test_mm256_shuffle_epi32 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index af1c904a6de48..568adaa5a49f4 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -1466,18 +1466,27 @@ __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.pshuf.b.512 return _mm512_shuffle_epi8(__A,__B); } + +TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63)); + __m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_shuffle_epi8 // CHECK: @llvm.x86.avx512.pshuf.b.512 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_shuffle_epi8(__W,__U,__A,__B); } + +TEST_CONSTEXPR(match_v64qi(_mm512_mask_shuffle_epi8((__m512i)(__v64qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8}, 0xFFFFFFFF00000000, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48)); + __m512i test_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_shuffle_epi8 // CHECK: @llvm.x86.avx512.pshuf.b.512 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_shuffle_epi8(__U,__A,__B); } + +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_shuffle_epi8(0x8888888888888888,(__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){127,126,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64}), 0,0,0,12,0,0,0,8,0,0,0,4,0,0,0,0,0,0,0,28,0,0,0,24,0,0,0,20,0,0,0,16,0,0,0,44,0,0,0,40,0,0,0,36,0,0,0,32,0,0,0,60,0,0,0,56,0,0,0,52,0,0,0,48)); + __m512i test_mm512_subs_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_subs_epi8 // CHECK: @llvm.ssub.sat.v64i8 diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index c0e46de8b81bf..d569283928a0a 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -1688,24 +1688,37 @@ __m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m12 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_shuffle_epi8(__W,__U,__A,__B); } + +TEST_CONSTEXPR(match_v16qi(_mm_mask_shuffle_epi8((__m128i)(__v16qi){1,1,1,1,1,1,1,1,2,2,4,4,6,6,8,8}, 0x00FF, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,2,2,4,4,6,6,8,8)); + __m128i test_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_shuffle_epi8 // CHECK: @llvm.x86.ssse3.pshuf.b // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_shuffle_epi8(__U,__A,__B); } + +TEST_CONSTEXPR(match_v16qi(_mm_maskz_shuffle_epi8(0xAAAA, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 0,14,0,12,0,10,0,8,0,6,0,4,0,2,0,0)); + __m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_shuffle_epi8 // CHECK: @llvm.x86.avx2.pshuf.b // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_shuffle_epi8(__W,__U,__A,__B); } + +TEST_CONSTEXPR(match_v32qi(_mm256_mask_shuffle_epi8((__m256i)(__v32qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4}, 0x80808080, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,8,2,2,2,2,2,2,2,0,3,3,3,3,3,3,3,24,4,4,4,4,4,4,4,16)); + + __m256i test_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_shuffle_epi8 // CHECK: @llvm.x86.avx2.pshuf.b // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_shuffle_epi8(__U,__A,__B); } + +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_shuffle_epi8(0x0000FFFF, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)); + __m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_subs_epi8 // CHECK: @llvm.ssub.sat.v16i8 diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 2b45b920cf2bc..9f2c2f302d515 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -583,6 +583,8 @@ __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) { return _mm_shuffle_pi8(a, b); } +TEST_CONSTEXPR(match_v8qi(_mm_shuffle_pi8((__m64)(__v8qi){0,1,2,3,4,5,6,7}, (__m64)(__v8qi){10,20,30,40,50,60,70,80}), 2,4,6,0,2,4,6,0)); + __m64 test_mm_shuffle_pi16(__m64 a) { // CHECK-LABEL: test_mm_shuffle_pi16 // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c index 588576879903b..509eec6d05baa 100644 --- a/clang/test/CodeGen/X86/ssse3-builtins.c +++ b/clang/test/CodeGen/X86/ssse3-builtins.c @@ -110,6 +110,8 @@ __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) { return _mm_shuffle_epi8(a, b); } +TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,char(-15)}, (__m128i)(__v16qi){15,char(-14),13,12,11,10,9,8,7,6,5,4,3,2,1,0}), -15,0,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); + __m128i test_mm_sign_epi8(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_sign_epi8 // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) From ac77cd12eef7d94b3727e2b2336816f164f8fbbd Mon Sep 17 00:00:00 2001 From: shashank1545 Date: Tue, 14 Oct 2025 17:54:23 +0530 Subject: [PATCH 2/4] Address review comments for PSHUFB constexpr evaluation --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 7 ------- clang/lib/AST/ExprConstant.cpp | 10 +++------- clang/lib/Headers/tmmintrin.h | 3 +-- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index db3bc98452f0f..f24417c2f9503 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2722,16 +2722,9 @@ static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC, const Pointer &Dst = S.Stk.peek(); unsigned NumElems = Dst.getNumElems(); - PrimType ElemT = Dst.getFieldDesc()->getPrimType(); - unsigned ElemBits = static_cast(primSize(ElemT) * 8); - - assert(NumElems == 16 || NumElems == 32 || NumElems == 64); assert(NumElems == Control.getNumElems()); assert(NumElems == Dst.getNumElems()); - if (ElemBits != 8) - return false; - for (unsigned Idx = 0; Idx != NumElems; ++Idx) { uint8_t Ctlb = static_cast(Control.elem(Idx)); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 2e5027d0397c4..a21a3b25bb429 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11629,12 +11629,8 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call, QualType ElemT = VT->getElementType(); unsigned ElemBits = Info.Ctx.getTypeSize(ElemT); - - if (ElemBits != 8) - return false; unsigned NumElts = VT->getNumElements(); - if (NumElts != 16 && NumElts != 32 && NumElts != 64) - return false; + bool DestUnsigned = ElemT->isUnsignedIntegerOrEnumerationType(); SmallVector ResultElements; ResultElements.reserve(NumElts); @@ -11642,10 +11638,10 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call, for (unsigned Idx = 0; Idx != NumElts; ++Idx) { APValue CtlVal = ControlVec.getVectorElt(Idx); APSInt CtlByte = CtlVal.getInt(); - uint8_t Ctl = static_cast(CtlByte.getZExtValue() & 0xFF); + uint8_t Ctl = static_cast(CtlByte.getZExtValue()); if (Ctl & 0x80) { - APSInt Zero(ElemBits, /*isUnsigned*/ false); + APSInt Zero(ElemBits, DestUnsigned); Zero = 0; ResultElements.push_back(APValue(Zero)); } else { diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 386c3704a2251..8bead94bae945 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -632,8 +632,7 @@ _mm_shuffle_pi8(__m64 __a, __m64 __b) { return __trunc64(__builtin_ia32_pshufb128( (__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){}, 0, 1, 0, 1), - (__v16qi)__builtin_shufflevector((__v2si)(__b), __extension__(__v2si){}, - 0, 1, 0, 1))); + (__v16qi)__zext128(__b))); } /// For each 8-bit integer in the first source operand, perform one of From 709e3164f48b6438267119b8afaa1aeefb700b71 Mon Sep 17 00:00:00 2001 From: shashank1545 Date: Tue, 14 Oct 2025 18:47:51 +0530 Subject: [PATCH 3/4] Simplify Zero initialization --- clang/lib/AST/ExprConstant.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index a21a3b25bb429..194eb433e5ec8 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11641,9 +11641,8 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call, uint8_t Ctl = static_cast(CtlByte.getZExtValue()); if (Ctl & 0x80) { - APSInt Zero(ElemBits, DestUnsigned); - Zero = 0; - ResultElements.push_back(APValue(Zero)); + APValue Zero(Info.Ctx.MakeIntValue(0, ElemT)); + ResultElements.push_back(Zero); } else { unsigned LaneBase = (Idx / 16) * 16; unsigned SrcOffset = Ctl & 0x0F; From df3cf14c7260c3e7e83ce1dba1ce0d2d61f0ef9e Mon Sep 17 00:00:00 2001 From: shashank1545 Date: Wed, 15 Oct 2025 11:17:52 +0530 Subject: [PATCH 4/4] Removed unused variables and made test more extensive --- clang/lib/AST/ExprConstant.cpp | 2 -- clang/test/CodeGen/X86/avx2-builtins.c | 2 +- clang/test/CodeGen/X86/avx512bw-builtins.c | 2 +- clang/test/CodeGen/X86/ssse3-builtins.c | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 194eb433e5ec8..0cefe6f3b9b31 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11628,9 +11628,7 @@ static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call, return false; QualType ElemT = VT->getElementType(); - unsigned ElemBits = Info.Ctx.getTypeSize(ElemT); unsigned NumElts = VT->getNumElements(); - bool DestUnsigned = ElemT->isUnsignedIntegerOrEnumerationType(); SmallVector ResultElements; ResultElements.reserve(NumElts); diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 7826de484f2c7..39ae5e1bf3fc0 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1106,7 +1106,7 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) { return _mm256_shuffle_epi8(a, b); } -TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){0,33,2,35,4,37,6,39,8,41,10,43,12,45,14,47,16,49,18,51,20,53,22,55,24,57,26,59,28,61,30,63}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31)); +TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qs){0,33,2,35,4,37,6,-39,8,41,10,43,12,45,14,-47,16,49,18,51,20,53,22,-55,24,57,26,59,28,61,30,-63}), 0,1,2,3,4,5,6,0,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,0,24,25,26,27,28,29,30,0)); __m256i test_mm256_shuffle_epi32(__m256i a) { // CHECK-LABEL: test_mm256_shuffle_epi32 diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 568adaa5a49f4..fddf17d524310 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -1467,7 +1467,7 @@ __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) { return _mm512_shuffle_epi8(__A,__B); } -TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63)); +TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,-79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,-95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,0,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,0)); __m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_shuffle_epi8 diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c index 509eec6d05baa..fb569be398111 100644 --- a/clang/test/CodeGen/X86/ssse3-builtins.c +++ b/clang/test/CodeGen/X86/ssse3-builtins.c @@ -110,7 +110,7 @@ __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) { return _mm_shuffle_epi8(a, b); } -TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,char(-15)}, (__m128i)(__v16qi){15,char(-14),13,12,11,10,9,8,7,6,5,4,3,2,1,0}), -15,0,13,12,11,10,9,8,7,6,5,4,3,2,1,0)); +TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qs){0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15}, (__m128i)(__v16qs){15,-14,13,-12,11,-10,9,-8,7,-6,5,-4,3,-2,1,0}), -15,0,-13,0,-11,0,-9,0,-7,0,-5,0,-3,0,-1,0)); __m128i test_mm_sign_epi8(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_sign_epi8