From f5662479038d0943e07690047526c8df0bfa4a09 Mon Sep 17 00:00:00 2001
From: NagaChaitanya Vellanki
Date: Sun, 23 Nov 2025 18:45:35 -0800
Subject: [PATCH 1/5] [Clang] VectorExprEvaluator::VisitCallExpr /
 InterpretBuiltin - Allow PSLL/PSRA/PSRL var intrinsics to be used in
 constexpr

Resolves: #169176
---
 clang/include/clang/Basic/BuiltinsX86.td   |  60 ++++------
 clang/lib/AST/ByteCode/InterpBuiltin.cpp   | 105 +++++++++++++++++
 clang/lib/AST/ExprConstant.cpp             | 106 +++++++++++++++++
 clang/lib/Headers/avx2intrin.h             |  40 +++----
 clang/lib/Headers/avx512bwintrin.h         |  45 +++----
 clang/lib/Headers/avx512fintrin.h          |  90 ++++++--------
 clang/lib/Headers/avx512vlintrin.h         | 130 +++++++++------
 clang/lib/Headers/emmintrin.h              |  32 ++---
 clang/test/CodeGen/X86/avx2-builtins.c     |  34 ++++++
 clang/test/CodeGen/X86/avx512bw-builtins.c |  27 ++++-
 clang/test/CodeGen/X86/avx512f-builtins.c  |  54 +++++++--
 clang/test/CodeGen/X86/avx512vl-builtins.c |  41 +++++--
 clang/test/CodeGen/X86/sse2-builtins.c     |  16 +++
 13 files changed, 520 insertions(+), 260 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index e82d7a8f62d8c..98cea35beb0ea 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -210,17 +210,6 @@ let Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in {
   def _mm_pause : X86LibBuiltin<"void()">;
 }
 
-let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def psraw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-  def psrad128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
-  def psrlw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-  def psrld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
-  def psrlq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
-  def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
-  def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
-  def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
-}
-
 let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
   def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
@@ -261,6 +250,15 @@ let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
   def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">;
   def vec_ext_v8hi : X86Builtin<"short(_Vector<8, short>, _Constant int)">;
   def vec_set_v8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, short, _Constant int)">;
+
+  def psraw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def psrad128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psrlw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def psrld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psrlq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
+  def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
+  def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
+  def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
 }
 
 let Features =
"sse3", Attributes = [NoThrow] in { @@ -579,14 +577,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def psadbw256 : X86Builtin< "_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; - def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; - def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; - def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; - def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; - def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; - def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; - def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; - def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">; def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">; def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; @@ -663,6 +653,15 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; + + def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; + def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; + def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; + def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; + def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; + def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; + def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; + def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; } let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { @@ -1926,16 +1925,13 @@ let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVect def prorq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def psllw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; -} - let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">; def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">; def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">; def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">; + def psllw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; } let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { @@ -1991,7 +1987,7 @@ let 
Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVect def psravq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def psraw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; def psrlw512 @@ -2308,25 +2304,17 @@ let Features = "avx512f", def psraqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def psraq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def psraq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; -} - -let Features = "avx512vl", - Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def psraqi128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, int)">; } -let Features = "avx512vl", - Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def psraq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; def psraqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pslld512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">; def psllq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>)">; def psrad512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>)">; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8874d9e3f6222..d21f42d94d3a5 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3468,6 +3468,69 @@ static bool interp__builtin_ia32_shuffle_generic( return true; } +static bool interp__builtin_ia32_shift_with_count( + InterpState &S, CodePtr OpPC, const CallExpr *Call, + llvm::function_ref ShiftOp, + llvm::function_ref OverflowOp) { + + assert(Call->getNumArgs() == 2); + + const Pointer &Count = S.Stk.pop(); + const Pointer &Source = S.Stk.pop(); + + QualType SourceType = Call->getArg(0)->getType(); + QualType CountType = Call->getArg(1)->getType(); + assert(SourceType->isVectorType() && CountType->isVectorType()); + + const auto *SourceVecT = SourceType->castAs(); + const auto *CountVecT = CountType->castAs(); + PrimType SourceElemT = *S.getContext().classify(SourceVecT->getElementType()); + PrimType CountElemT = *S.getContext().classify(CountVecT->getElementType()); + + const Pointer &Dst = S.Stk.peek(); + + unsigned DestEltWidth = + S.getASTContext().getTypeSize(SourceVecT->getElementType()); + bool IsDestUnsigned = SourceVecT->getElementType()->isUnsignedIntegerType(); + unsigned DestLen = SourceVecT->getNumElements(); + unsigned CountEltWidth = + 
S.getASTContext().getTypeSize(CountVecT->getElementType()); + unsigned NumBitsInQWord = 64; + unsigned NumCountElts = NumBitsInQWord / CountEltWidth; + + uint64_t CountLQWord = 0; + for (unsigned EltIdx = 0; EltIdx != NumCountElts; ++EltIdx) { + uint64_t Elt = 0; + INT_TYPE_SWITCH(CountElemT, + { Elt = static_cast(Count.elem(EltIdx)); }); + CountLQWord |= (Elt << (EltIdx * CountEltWidth)); + } + + for (unsigned EltIdx = 0; EltIdx != DestLen; ++EltIdx) { + APSInt Elt; + INT_TYPE_SWITCH(SourceElemT, { Elt = Source.elem(EltIdx).toAPSInt(); }); + + APInt Result; + if (CountLQWord < DestEltWidth) { + Result = ShiftOp(Elt, CountLQWord); + } else { + Result = OverflowOp(Elt, DestEltWidth); + } + if (IsDestUnsigned) { + INT_TYPE_SWITCH(SourceElemT, { + Dst.elem(EltIdx) = T::from(Result.getZExtValue()); + }); + } else { + INT_TYPE_SWITCH(SourceElemT, { + Dst.elem(EltIdx) = T::from(Result.getSExtValue()); + }); + } + } + + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_ia32_shufbitqmb_mask(InterpState &S, CodePtr OpPC, const CallExpr *Call) { @@ -4971,6 +5034,48 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_phminposuw128: return interp__builtin_ia32_phminposuw(S, OpPC, Call); + case X86::BI__builtin_ia32_psraq128: + case X86::BI__builtin_ia32_psraq256: + case X86::BI__builtin_ia32_psraq512: + case X86::BI__builtin_ia32_psrad128: + case X86::BI__builtin_ia32_psrad256: + case X86::BI__builtin_ia32_psrad512: + case X86::BI__builtin_ia32_psraw128: + case X86::BI__builtin_ia32_psraw256: + case X86::BI__builtin_ia32_psraw512: + return interp__builtin_ia32_shift_with_count( + S, OpPC, Call, + [](const APInt &Elt, uint64_t Count) { return Elt.ashr(Count); }, + [](const APInt &Elt, unsigned Width) { return Elt.ashr(Width - 1); }); + + case X86::BI__builtin_ia32_psllq128: + case X86::BI__builtin_ia32_psllq256: + case X86::BI__builtin_ia32_psllq512: + case X86::BI__builtin_ia32_pslld128: + case X86::BI__builtin_ia32_pslld256: + case X86::BI__builtin_ia32_pslld512: + case X86::BI__builtin_ia32_psllw128: + case X86::BI__builtin_ia32_psllw256: + case X86::BI__builtin_ia32_psllw512: + return interp__builtin_ia32_shift_with_count( + S, OpPC, Call, + [](const APInt &Elt, uint64_t Count) { return Elt.shl(Count); }, + [](const APInt &Elt, unsigned Width) { return APInt::getZero(Width); }); + + case X86::BI__builtin_ia32_psrlq128: + case X86::BI__builtin_ia32_psrlq256: + case X86::BI__builtin_ia32_psrlq512: + case X86::BI__builtin_ia32_psrld128: + case X86::BI__builtin_ia32_psrld256: + case X86::BI__builtin_ia32_psrld512: + case X86::BI__builtin_ia32_psrlw128: + case X86::BI__builtin_ia32_psrlw256: + case X86::BI__builtin_ia32_psrlw512: + return interp__builtin_ia32_shift_with_count( + S, OpPC, Call, + [](const APInt &Elt, uint64_t Count) { return Elt.lshr(Count); }, + [](const APInt &Elt, unsigned Width) { return APInt::getZero(Width); }); + case X86::BI__builtin_ia32_pternlogd128_mask: case X86::BI__builtin_ia32_pternlogd256_mask: case X86::BI__builtin_ia32_pternlogd512_mask: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 7e2fa7debdd87..7b9380de6834d 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12166,6 +12166,52 @@ static bool evalShuffleGeneric( return true; } +static bool evalShiftWithCount( + EvalInfo &Info, const CallExpr *Call, APValue &Out, + llvm::function_ref ShiftOp, + llvm::function_ref OverflowOp) { + + APValue Source, Count; + if 
(!EvaluateAsRValue(Info, Call->getArg(0), Source) || + !EvaluateAsRValue(Info, Call->getArg(1), Count)) + return false; + + assert(Call->getNumArgs() == 2); + + QualType SourceTy = Call->getArg(0)->getType(); + QualType CountTy = Call->getArg(1)->getType(); + assert(SourceTy->isVectorType() && CountTy->isVectorType()); + + QualType DestEltTy = SourceTy->castAs()->getElementType(); + unsigned DestEltWidth = Source.getVectorElt(0).getInt().getBitWidth(); + unsigned DestLen = Source.getVectorLength(); + bool IsDestUnsigned = DestEltTy->isUnsignedIntegerType(); + unsigned CountEltWidth = Count.getVectorElt(0).getInt().getBitWidth(); + unsigned NumBitsInQWord = 64; + unsigned NumCountElts = NumBitsInQWord / CountEltWidth; + SmallVector Result; + Result.reserve(DestLen); + + uint64_t CountLQWord = 0; + for (unsigned EltIdx = 0; EltIdx != NumCountElts; ++EltIdx) { + uint64_t Elt = Count.getVectorElt(EltIdx).getInt().getZExtValue(); + CountLQWord |= (Elt << (EltIdx * CountEltWidth)); + } + + for (unsigned EltIdx = 0; EltIdx != DestLen; ++EltIdx) { + APInt Elt = Source.getVectorElt(EltIdx).getInt(); + if (CountLQWord < DestEltWidth) { + Result.push_back( + APValue(APSInt(ShiftOp(Elt, CountLQWord), IsDestUnsigned))); + } else { + Result.push_back( + APValue(APSInt(OverflowOp(Elt, DestEltWidth), IsDestUnsigned))); + } + } + Out = APValue(Result.data(), Result.size()); + return true; +} + bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!IsConstantEvaluatedBuiltinCall(E)) return ExprEvaluatorBaseTy::VisitCallExpr(E); @@ -13169,6 +13215,66 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(Result.data(), Result.size()), E); } + case X86::BI__builtin_ia32_psraq128: + case X86::BI__builtin_ia32_psraq256: + case X86::BI__builtin_ia32_psraq512: + case X86::BI__builtin_ia32_psrad128: + case X86::BI__builtin_ia32_psrad256: + case X86::BI__builtin_ia32_psrad512: + case X86::BI__builtin_ia32_psraw128: + case X86::BI__builtin_ia32_psraw256: + case X86::BI__builtin_ia32_psraw512: { + APValue R; + if (!evalShiftWithCount( + Info, E, R, + [](const APInt &Elt, uint64_t Count) { return Elt.ashr(Count); }, + [](const APInt &Elt, unsigned Width) { + return Elt.ashr(Width - 1); + })) + return false; + return Success(R, E); + } + + case X86::BI__builtin_ia32_psllq128: + case X86::BI__builtin_ia32_psllq256: + case X86::BI__builtin_ia32_psllq512: + case X86::BI__builtin_ia32_pslld128: + case X86::BI__builtin_ia32_pslld256: + case X86::BI__builtin_ia32_pslld512: + case X86::BI__builtin_ia32_psllw128: + case X86::BI__builtin_ia32_psllw256: + case X86::BI__builtin_ia32_psllw512: { + APValue R; + if (!evalShiftWithCount( + Info, E, R, + [](const APInt &Elt, uint64_t Count) { return Elt.shl(Count); }, + [](const APInt &Elt, unsigned Width) { + return APInt::getZero(Width); + })) + return false; + return Success(R, E); + } + + case X86::BI__builtin_ia32_psrlq128: + case X86::BI__builtin_ia32_psrlq256: + case X86::BI__builtin_ia32_psrlq512: + case X86::BI__builtin_ia32_psrld128: + case X86::BI__builtin_ia32_psrld256: + case X86::BI__builtin_ia32_psrld512: + case X86::BI__builtin_ia32_psrlw128: + case X86::BI__builtin_ia32_psrlw256: + case X86::BI__builtin_ia32_psrlw512: { + APValue R; + if (!evalShiftWithCount( + Info, E, R, + [](const APInt &Elt, uint64_t Count) { return Elt.lshr(Count); }, + [](const APInt &Elt, unsigned Width) { + return APInt::getZero(Width); + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_pternlogd128_mask: case 
X86::BI__builtin_ia32_pternlogd256_mask: case X86::BI__builtin_ia32_pternlogd512_mask: diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 3e3c13d8bd662..d3ceb2327ac62 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -2095,9 +2095,8 @@ _mm256_slli_epi16(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sll_epi16(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sll_epi16(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); } @@ -2134,9 +2133,8 @@ _mm256_slli_epi32(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sll_epi32(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sll_epi32(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); } @@ -2173,9 +2171,8 @@ _mm256_slli_epi64(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sll_epi64(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sll_epi64(__m256i __a, __m128i __count) { return __builtin_ia32_psllq256((__v4di)__a, __count); } @@ -2214,9 +2211,8 @@ _mm256_srai_epi16(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sra_epi16(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sra_epi16(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); } @@ -2255,9 +2251,8 @@ _mm256_srai_epi32(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sra_epi32(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sra_epi32(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); } @@ -2336,9 +2331,8 @@ _mm256_srli_epi16(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [16 x i16] containing the result. 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srl_epi16(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srl_epi16(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); } @@ -2375,9 +2369,8 @@ _mm256_srli_epi32(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srl_epi32(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srl_epi32(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); } @@ -2414,9 +2407,8 @@ _mm256_srli_epi64(__m256i __a, int __count) { /// A 128-bit vector of [2 x i64] whose lower element gives the unsigned /// shift count (in bits). The upper element is ignored. /// \returns A 256-bit vector of [4 x i64] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srl_epi64(__m256i __a, __m128i __count) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_srl_epi64(__m256i __a, __m128i __count) { return __builtin_ia32_psrlq256((__v4di)__a, __count); } diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 53d641e9e2eae..67e8461560b04 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -1383,23 +1383,20 @@ _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sll_epi16(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sll_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sll_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sll_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); @@ -1473,23 +1470,20 @@ _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sra_epi16(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sra_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sra_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 
-_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sra_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); @@ -1515,23 +1509,20 @@ _mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B) { (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srl_epi16(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srl_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srl_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srl_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index e1de56069870b..806a13c414c10 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -5382,45 +5382,39 @@ _mm512_kmov (__mmask16 __A) ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sll_epi32(__m512i __A, __m128i __B) -{ +static __inline__ __m512i + __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_sll_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_sll_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_sll_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sll_epi64(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sll_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_sll_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 
-_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_sll_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); @@ -5467,45 +5461,39 @@ _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sra_epi32(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sra_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_sra_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_sra_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sra_epi64(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_sra_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_sra_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_sra_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); @@ -5552,45 +5540,39 @@ _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srl_epi32(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srl_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_srl_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { return 
(__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_srl_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srl_epi64(__m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_srl_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_srl_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_srl_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 99c057030a4cc..388f99d812312 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -4299,33 +4299,29 @@ _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) (__v4di)_mm256_ror_epi64((a), (b)), \ (__v4di)_mm256_setzero_si256())) -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_sll_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_sll_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_sll_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_sll_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); @@ -4362,33 +4358,29 @@ _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_sll_epi64(__A, __B), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sll_epi64(__mmask8 __U, 
__m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_sll_epi64(__A, __B), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_sll_epi64(__A, __B), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_sll_epi64(__A, __B), (__v4di)_mm256_setzero_si256()); @@ -4641,33 +4633,29 @@ _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_srl_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_srl_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_srl_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_srl_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); @@ -4704,33 +4692,29 @@ _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_srl_epi64(__A, __B), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_srl_epi64(__A, __B), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srl_epi64(__m256i __W, __mmask8 
__U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_srl_epi64(__A, __B), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_srl_epi64(__A, __B), (__v4di)_mm256_setzero_si256()); @@ -6127,33 +6111,29 @@ _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_sra_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_sra_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_sra_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_sra_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); @@ -6188,45 +6168,39 @@ _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B) { (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_sra_epi64(__m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_sra_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ (__v2di)_mm_sra_epi64(__A, __B), \ (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ (__v2di)_mm_sra_epi64(__A, __B), \ (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sra_epi64(__m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sra_epi64(__m256i 
__A, __m128i __B) { return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ (__v4di)_mm256_sra_epi64(__A, __B), \ (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ (__v4di)_mm256_sra_epi64(__A, __B), \ (__v4di)_mm256_setzero_si256()); diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 7eb15e06db698..9d71c69878e47 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2782,8 +2782,8 @@ _mm_slli_epi16(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to left-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sll_epi16(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); } @@ -2818,8 +2818,8 @@ _mm_slli_epi32(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to left-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sll_epi32(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); } @@ -2854,8 +2854,8 @@ _mm_slli_epi64(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to left-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sll_epi64(__m128i __a, __m128i __count) { return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); } @@ -2892,8 +2892,8 @@ _mm_srai_epi16(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sra_epi16(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); } @@ -2930,8 +2930,8 @@ _mm_srai_epi32(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. 
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sra_epi32(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); } @@ -2991,8 +2991,8 @@ _mm_srli_epi16(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srl_epi16(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); } @@ -3027,8 +3027,8 @@ _mm_srli_epi32(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srl_epi32(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); } @@ -3063,8 +3063,8 @@ _mm_srli_epi64(__m128i __a, int __count) { /// A 128-bit integer vector in which bits [63:0] specify the number of bits /// to right-shift each value in operand \a __a. /// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, - __m128i __count) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_srl_epi64(__m128i __a, __m128i __count) { return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); } diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 6a884e98e9f3b..398bf0c90105f 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1191,6 +1191,30 @@ __m256i test_mm256_sign_epi32(__m256i a, __m256i b) { } TEST_CONSTEXPR(match_v8si(_mm256_sign_epi32((__m256i)(__v8si){0xbeef,0xfeed,0xbead,0xdeed, -1,2,-3,4}, (__m256i)(__v8si){0,0,0,0,-1,-1,-1,-1}), 0,0,0,0, 1,-2,3,-4)); +__m256i test_mm256_sll_epi16(__m256i a, __m128i b) { + // CHECK-LABEL: test_mm256_sll_epi16 + // CHECK: call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %{{.*}}, <8 x i16> %{{.*}}) + return _mm256_sll_epi16(a, b); +} +TEST_CONSTEXPR(match_v16hi(_mm256_sll_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30)); +TEST_CONSTEXPR(match_v16hi(_mm256_sll_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + +__m256i test_mm256_sll_epi32(__m256i a, __m128i b) { + // CHECK-LABEL: test_mm256_sll_epi32 + // CHECK: call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}) + return _mm256_sll_epi32(a, b); +} +TEST_CONSTEXPR(match_v8si(_mm256_sll_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 2, 4, 6, 8, 10, 12, 14)); +TEST_CONSTEXPR(match_v8si(_mm256_sll_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); + +__m256i test_mm256_sll_epi64(__m256i a, 
__m128i b) { + // CHECK-LABEL: test_mm256_sll_epi64 + // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %{{.*}}, <2 x i64> %{{.*}}) + return _mm256_sll_epi64(a, b); +} +TEST_CONSTEXPR(match_v4di(_mm256_sll_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){1, 0}), 0, 2, 4, 6)); +TEST_CONSTEXPR(match_v4di(_mm256_sll_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){64, 0}), 0, 0, 0, 0)); + __m256i test_mm256_slli_epi16(__m256i a) { // CHECK-LABEL: test_mm256_slli_epi16 // CHECK: call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %{{.*}}, i32 %{{.*}}) @@ -1283,12 +1307,16 @@ __m256i test_mm256_sra_epi16(__m256i a, __m128i b) { // CHECK: call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm256_sra_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_sra_epi16((__m256i)(__v16hi){-32768, 32767, -3, -2, -1, 0, 1, 2, -32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), -16384, 16383, -2, -1, -1, 0, 0, 1, -16384, 16383, -2, -1, -1, 0, 0, 1)); +TEST_CONSTEXPR(match_v16hi(_mm256_sra_epi16((__m256i)(__v16hi){-32768, 32767, -3, -2, -1, 0, 1, 2, -32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), -1, 0, -1, -1, -1, 0, 0, 0, -1, 0, -1, -1, -1, 0, 0, 0)); __m256i test_mm256_sra_epi32(__m256i a, __m128i b) { // CHECK-LABEL: test_mm256_sra_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm256_sra_epi32(a, b); } +TEST_CONSTEXPR(match_v8si(_mm256_sra_epi32((__m256i)(__v8si){-32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v4si){1, 0, 0, 0}), -16384, 16383, -2, -1, -1, 0, 0, 1)); +TEST_CONSTEXPR(match_v8si(_mm256_sra_epi32((__m256i)(__v8si){-32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v4si){32, 0, 0, 0}), -1, 0, -1, -1, -1, 0, 0, 0)); __m256i test_mm256_srai_epi16(__m256i a) { // CHECK-LABEL: test_mm256_srai_epi16 @@ -1335,18 +1363,24 @@ __m256i test_mm256_srl_epi16(__m256i a, __m128i b) { // CHECK: call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm256_srl_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_srl_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)); +TEST_CONSTEXPR(match_v16hi(_mm256_srl_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test_mm256_srl_epi32(__m256i a, __m128i b) { // CHECK-LABEL: test_mm256_srl_epi32 // CHECK:call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm256_srl_epi32(a, b); } +TEST_CONSTEXPR(match_v8si(_mm256_srl_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 0, 1, 1, 2, 2, 3, 3)); +TEST_CONSTEXPR(match_v8si(_mm256_srl_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test_mm256_srl_epi64(__m256i a, __m128i b) { // CHECK-LABEL: test_mm256_srl_epi64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %{{.*}}, <2 x i64> %{{.*}}) return _mm256_srl_epi64(a, b); } +TEST_CONSTEXPR(match_v4di(_mm256_srl_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){1, 0}), 0, 0, 1, 1)); +TEST_CONSTEXPR(match_v4di(_mm256_srl_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){64, 0}), 0, 0, 0, 0)); __m256i test_mm256_srli_epi16(__m256i a) { // CHECK-LABEL: test_mm256_srli_epi16 diff --git 
a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 091d0c40f0ffa..5a653cb3fa0a3 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -2297,8 +2297,11 @@ TEST_CONSTEXPR(match_v32hi(_mm512_maskz_sllv_epi16(0xB120676B, (__m512i)(__v32hi __m512i test_mm512_sll_epi16(__m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_sll_epi16 // CHECK: @llvm.x86.avx512.psll.w.512 - return _mm512_sll_epi16(__A, __B); + return _mm512_sll_epi16(__A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62)); +TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){17, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sll_epi16 @@ -2311,8 +2314,10 @@ __m512i test_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_maskz_sll_epi16 // CHECK: @llvm.x86.avx512.psll.w.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - return _mm512_maskz_sll_epi16(__U, __A, __B); + return _mm512_maskz_sll_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_sll_epi16((__m512i)(__v32hi){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0x5A5A5A5A, (__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 99, 2, 99, 6, 8, 99, 12, 99, 99, 18, 99, 22, 24, 99, 28, 99, 99, 34, 99, 38, 40, 99, 44, 99, 99, 50, 99, 54, 56, 99, 60, 99)); +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_sll_epi16(0xA5A5A5A5, (__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 4, 0, 0, 10, 0, 14, 16, 0, 20, 0, 0, 26, 0, 30, 32, 0, 36, 0, 0, 42, 0, 46, 48, 0, 52, 0, 0, 58, 0, 62)); __m512i test_mm512_slli_epi16(__m512i __A) { // CHECK-LABEL: test_mm512_slli_epi16 @@ -2422,8 +2427,11 @@ TEST_CONSTEXPR(match_v32hi(_mm512_maskz_srav_epi16(0xB120676B, (__m512i)(__v32hi __m512i test_mm512_sra_epi16(__m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_sra_epi16 // CHECK: @llvm.x86.avx512.psra.w.512 - return _mm512_sra_epi16(__A, __B); + return _mm512_sra_epi16(__A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 
-1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17, 18, -19, 20, -21, 22, -23, 24, -25, 26, -27, 28, -29, 30, -31)); +TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); +TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){17, 0, 0, 0, 0, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); __m512i test_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sra_epi16 @@ -2436,8 +2444,10 @@ __m512i test_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_maskz_sra_epi16 // CHECK: @llvm.x86.avx512.psra.w.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - return _mm512_maskz_sra_epi16(__U, __A, __B); + return _mm512_maskz_sra_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_sra_epi16((__m512i)(__v32hi){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0x5A5A5A5A, (__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 99, -1, 99, -3, 4, 99, 6, 99, 99, -9, 99, -11, 12, 99, 14, 99, 99, -17, 99, -19, 20, 99, 22, 99, 99, -25, 99, -27, 28, 99, 30, 99)); +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_sra_epi16(0xA5A5A5A5, (__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 2, 0, 0, -5, 0, -7, 8, 0, 10, 0, 0, -13, 0, -15, 16, 0, 18, 0, 0, -21, 0, -23, 24, 0, 26, 0, 0, -29, 0, -31)); __m512i test_mm512_srai_epi16(__m512i __A) { // CHECK-LABEL: test_mm512_srai_epi16 @@ -2485,8 +2495,11 @@ __m512i test_mm512_maskz_srai_epi16_2(__mmask32 __U, __m512i __A, unsigned int _ __m512i test_mm512_srl_epi16(__m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_srl_epi16 // CHECK: @llvm.x86.avx512.psrl.w.512 - return _mm512_srl_epi16(__A, __B); + return _mm512_srl_epi16(__A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 32767, 2, 32765, 4, 32763, 6, 32761, 8, 32759, 10, 32757, 12, 32755, 14, 32753, 16, 32751, 18, 32749, 20, 32747, 22, 32745, 24, 32743, 26, 32741, 28, 32739, 30, 32737)); +TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, 
-26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){17, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_srl_epi16 @@ -2499,8 +2512,10 @@ __m512i test_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_maskz_srl_epi16 // CHECK: @llvm.x86.avx512.psrl.w.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - return _mm512_maskz_srl_epi16(__U, __A, __B); + return _mm512_maskz_srl_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_srl_epi16((__m512i)(__v32hi){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0x5A5A5A5A, (__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 99, 32767, 99, 32765, 4, 99, 6, 99, 99, 32759, 99, 32757, 12, 99, 14, 99, 99, 32751, 99, 32749, 20, 99, 22, 99, 99, 32743, 99, 32741, 28, 99, 30, 99)); +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_srl_epi16(0xA5A5A5A5, (__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 2, 0, 0, 32763, 0, 32761, 8, 0, 10, 0, 0, 32755, 0, 32753, 16, 0, 18, 0, 0, 32747, 0, 32745, 24, 0, 26, 0, 0, 32739, 0, 32737)); __m512i test_mm512_srli_epi16(__m512i __A) { // CHECK-LABEL: test_mm512_srli_epi16 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 753b4e48c20ed..9b1dae2228eb4 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -6282,8 +6282,11 @@ __m512i test_mm512_maskz_srai_epi64_2(__mmask8 __U, __m512i __A, unsigned int __ __m512i test_mm512_sll_epi32(__m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_sll_epi32 // CHECK: @llvm.x86.avx512.psll.d.512 - return _mm512_sll_epi32(__A, __B); + return _mm512_sll_epi32(__A, __B); } +TEST_CONSTEXPR(match_v16si(_mm512_sll_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30)); +TEST_CONSTEXPR(match_v16si(_mm512_sll_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16si(_mm512_sll_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){33, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sll_epi32 @@ -6296,14 +6299,19 @@ __m512i test_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_maskz_sll_epi32 // CHECK: @llvm.x86.avx512.psll.d.512 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} - return _mm512_maskz_sll_epi32(__U, __A, __B); + return _mm512_maskz_sll_epi32(__U, __A, __B); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_sll_epi32((__m512i)(__v16si){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 
0x5A5A, (__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){1, 0, 0, 0}), 99, 2, 99, 6, 8, 99, 12, 99, 99, 18, 99, 22, 24, 99, 28, 99)); +TEST_CONSTEXPR(match_v16si(_mm512_maskz_sll_epi32(0xA5A5, (__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 0, 4, 0, 0, 10, 0, 14, 16, 0, 20, 0, 0, 26, 0, 30)); __m512i test_mm512_sll_epi64(__m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_sll_epi64 // CHECK: @llvm.x86.avx512.psll.q.512 - return _mm512_sll_epi64(__A, __B); + return _mm512_sll_epi64(__A, __B); } +TEST_CONSTEXPR(match_v8di(_mm512_sll_epi64((__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v2di){1, 0}), 0, 2, 4, 6, 8, 10, 12, 14)); +TEST_CONSTEXPR(match_v8di(_mm512_sll_epi64((__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v2di){64, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8di(_mm512_sll_epi64((__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v2di){65, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sll_epi64 @@ -6316,8 +6324,10 @@ __m512i test_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_maskz_sll_epi64 // CHECK: @llvm.x86.avx512.psll.q.512 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_maskz_sll_epi64(__U, __A, __B); + return _mm512_maskz_sll_epi64(__U, __A, __B); } +TEST_CONSTEXPR(match_v8di(_mm512_mask_sll_epi64((__m512i)(__v8di){99, 99, 99, 99, 99, 99, 99, 99}, 0x5A, (__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v2di){1, 0}), 99, 2, 99, 6, 8, 99, 12, 99)); +TEST_CONSTEXPR(match_v8di(_mm512_maskz_sll_epi64(0xA5, (__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v2di){1, 0}), 0, 0, 4, 0, 0, 10, 0, 14)); __m512i test_mm512_sllv_epi32(__m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_sllv_epi32 @@ -6368,8 +6378,11 @@ TEST_CONSTEXPR(match_v8di(_mm512_maskz_sllv_epi64(0xE4, (__m512i)(__v8di){ 16, - __m512i test_mm512_sra_epi32(__m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_sra_epi32 // CHECK: @llvm.x86.avx512.psra.d.512 - return _mm512_sra_epi32(__A, __B); + return _mm512_sra_epi32(__A, __B); } +TEST_CONSTEXPR(match_v16si(_mm512_sra_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 0, 0}), 0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15)); +TEST_CONSTEXPR(match_v16si(_mm512_sra_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){32, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); +TEST_CONSTEXPR(match_v16si(_mm512_sra_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){33, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); __m512i test_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sra_epi32 @@ -6382,14 +6395,19 @@ __m512i test_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_maskz_sra_epi32 // CHECK: @llvm.x86.avx512.psra.d.512 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} - return _mm512_maskz_sra_epi32(__U, __A, __B); + return _mm512_maskz_sra_epi32(__U, __A, __B); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_sra_epi32((__m512i)(__v16si){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 
99, 99, 99, 99, 99}, 0x5A5A, (__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 0, 0}), 99, -1, 99, -3, 4, 99, 6, 99, 99, -9, 99, -11, 12, 99, 14, 99)); +TEST_CONSTEXPR(match_v16si(_mm512_maskz_sra_epi32(0xA5A5, (__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 0, 2, 0, 0, -5, 0, -7, 8, 0, 10, 0, 0, -13, 0, -15)); __m512i test_mm512_sra_epi64(__m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_sra_epi64 // CHECK: @llvm.x86.avx512.psra.q.512 - return _mm512_sra_epi64(__A, __B); + return _mm512_sra_epi64(__A, __B); } +TEST_CONSTEXPR(match_v8di(_mm512_sra_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 0}), 0, -1, 2, -3, 4, -5, 6, -7)); +TEST_CONSTEXPR(match_v8di(_mm512_sra_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){64, 0}), 0, -1, 0, -1, 0, -1, 0, -1)); +TEST_CONSTEXPR(match_v8di(_mm512_sra_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){65, 0}), 0, -1, 0, -1, 0, -1, 0, -1)); __m512i test_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sra_epi64 @@ -6402,8 +6420,10 @@ __m512i test_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_maskz_sra_epi64 // CHECK: @llvm.x86.avx512.psra.q.512 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_maskz_sra_epi64(__U, __A, __B); + return _mm512_maskz_sra_epi64(__U, __A, __B); } +TEST_CONSTEXPR(match_v8di(_mm512_mask_sra_epi64((__m512i)(__v8di){99, 99, 99, 99, 99, 99, 99, 99}, 0x5A, (__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 0}), 99, -1, 99, -3, 4, 99, 6, 99)); +TEST_CONSTEXPR(match_v8di(_mm512_maskz_sra_epi64(0xA5, (__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 0}), 0, 0, 2, 0, 0, -5, 0, -7)); __m512i test_mm512_srav_epi32(__m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_srav_epi32 @@ -6454,8 +6474,11 @@ TEST_CONSTEXPR(match_v8di(_mm512_maskz_srav_epi64(0xE4, (__m512i)(__v8di){ 16, - __m512i test_mm512_srl_epi32(__m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_srl_epi32 // CHECK: @llvm.x86.avx512.psrl.d.512 - return _mm512_srl_epi32(__A, __B); + return _mm512_srl_epi32(__A, __B); } +TEST_CONSTEXPR(match_v16si(_mm512_srl_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 2147483647, 2, 2147483645, 4, 2147483643, 6, 2147483641, 8, 2147483639, 10, 2147483637, 12, 2147483635, 14, 2147483633)); +TEST_CONSTEXPR(match_v16si(_mm512_srl_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16si(_mm512_srl_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){33, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_srl_epi32 @@ -6468,14 +6491,19 @@ __m512i test_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_maskz_srl_epi32 // CHECK: @llvm.x86.avx512.psrl.d.512 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} - return _mm512_maskz_srl_epi32(__U, __A, __B); + return 
_mm512_maskz_srl_epi32(__U, __A, __B); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_srl_epi32((__m512i)(__v16si){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0x5A5A, (__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 0, 0}), 99, 2147483647, 99, 2147483645, 4, 99, 6, 99, 99, 2147483639, 99, 2147483637, 12, 99, 14, 99)); +TEST_CONSTEXPR(match_v16si(_mm512_maskz_srl_epi32(0xA5A5, (__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 0, 2, 0, 0, 2147483643, 0, 2147483641, 8, 0, 10, 0, 0, 2147483635, 0, 2147483633)); __m512i test_mm512_srl_epi64(__m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_srl_epi64 // CHECK: @llvm.x86.avx512.psrl.q.512 - return _mm512_srl_epi64(__A, __B); + return _mm512_srl_epi64(__A, __B); } +TEST_CONSTEXPR(match_v8di(_mm512_srl_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 0}), 0, 9223372036854775807, 2, 9223372036854775805, 4, 9223372036854775803, 6, 9223372036854775801)); +TEST_CONSTEXPR(match_v8di(_mm512_srl_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){64, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8di(_mm512_srl_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){65, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_srl_epi64 @@ -6488,8 +6516,10 @@ __m512i test_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_maskz_srl_epi64 // CHECK: @llvm.x86.avx512.psrl.q.512 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_maskz_srl_epi64(__U, __A, __B); + return _mm512_maskz_srl_epi64(__U, __A, __B); } +TEST_CONSTEXPR(match_v8di(_mm512_mask_srl_epi64((__m512i)(__v8di){99, 99, 99, 99, 99, 99, 99, 99}, 0x5A, (__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 0}), 99, 9223372036854775807, 99, 9223372036854775805, 4, 99, 6, 99)); +TEST_CONSTEXPR(match_v8di(_mm512_maskz_srl_epi64(0xA5, (__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 0}), 0, 0, 2, 0, 0, 9223372036854775803, 0, 9223372036854775801)); __m512i test_mm512_srlv_epi32(__m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_srlv_epi32 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index e1878bd84b1ad..5f6d8360888f5 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -6746,8 +6746,12 @@ __m256i test_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) { // CHECK-LABEL: test_mm256_maskz_srl_epi32 // CHECK: @llvm.x86.avx2.psrl.d // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} - return _mm256_maskz_srl_epi32(__U, __A, __B); + return _mm256_maskz_srl_epi32(__U, __A, __B); } +TEST_CONSTEXPR(match_v4si(_mm_mask_srl_epi32((__m128i)(__v4si){99, 99, 99, 99}, 0x5, (__m128i)(__v4si){-2, 4, -6, 8}, (__m128i)(__v4si){1, 0, 0, 0}), 2147483647, 99, 2147483645, 99)); +TEST_CONSTEXPR(match_v4si(_mm_maskz_srl_epi32(0xA, (__m128i)(__v4si){-2, 4, -6, 8}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 2, 0, 4)); +TEST_CONSTEXPR(match_v8si(_mm256_mask_srl_epi32((__m256i)(__v8si){99, 99, 99, 99, 99, 99, 99, 99}, 0x5A, (__m256i)(__v8si){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v4si){1, 0, 0, 0}), 99, 2147483647, 99, 2147483645, 4, 99, 6, 99)); 
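+// A minimal sketch of the per-lane rule these constexpr checks assume for the
+// srl-by-vector forms: the shift count is read from the low 64 bits of the
+// count operand, lanes shift logically (zero-fill), and any count >= the
+// element width yields an all-zero lane; the helper below is hypothetical,
+// for illustration only:
+//   static inline unsigned srl32_lane(unsigned v, unsigned long long c) {
+//     return c >= 32 ? 0u : v >> c;  // e.g. (unsigned)-2 >> 1 == 2147483647
+//   }
+// In the mask/maskz variants, a set mask bit selects the shifted lane and a
+// clear bit keeps the corresponding lane of __W (mask) or produces 0 (maskz).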
+TEST_CONSTEXPR(match_v8si(_mm256_maskz_srl_epi32(0xA5, (__m256i)(__v8si){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 0, 2, 0, 0, 2147483643, 0, 2147483641)); __m128i test_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_srli_epi32 @@ -6831,8 +6835,12 @@ __m256i test_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) { // CHECK-LABEL: test_mm256_maskz_srl_epi64 // CHECK: @llvm.x86.avx2.psrl.q // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} - return _mm256_maskz_srl_epi64(__U, __A, __B); + return _mm256_maskz_srl_epi64(__U, __A, __B); } +TEST_CONSTEXPR(match_v2di(_mm_mask_srl_epi64((__m128i)(__v2di){99, 99}, 0x1, (__m128i)(__v2di){-2, 4}, (__m128i)(__v2di){1, 0}), 9223372036854775807, 99)); +TEST_CONSTEXPR(match_v2di(_mm_maskz_srl_epi64(0x2, (__m128i)(__v2di){-2, 4}, (__m128i)(__v2di){1, 0}), 0, 2)); +TEST_CONSTEXPR(match_v4di(_mm256_mask_srl_epi64((__m256i)(__v4di){99, 99, 99, 99}, 0x5, (__m256i)(__v4di){0, -2, 4, -6}, (__m128i)(__v2di){1, 0}), 0, 99, 2, 99)); +TEST_CONSTEXPR(match_v4di(_mm256_maskz_srl_epi64(0xA, (__m256i)(__v4di){0, -2, 4, -6}, (__m128i)(__v2di){1, 0}), 0, 9223372036854775807, 0, 9223372036854775805)); __m128i test_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_srli_epi64 @@ -6917,8 +6925,12 @@ __m256i test_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) { // CHECK-LABEL: test_mm256_maskz_sll_epi32 // CHECK: @llvm.x86.avx2.psll.d // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} - return _mm256_maskz_sll_epi32(__U, __A, __B); + return _mm256_maskz_sll_epi32(__U, __A, __B); } +TEST_CONSTEXPR(match_v4si(_mm_mask_sll_epi32((__m128i)(__v4si){99, 99, 99, 99}, 0x5, (__m128i)(__v4si){1, 2, 3, 4}, (__m128i)(__v4si){1, 0, 0, 0}), 2, 99, 6, 99)); +TEST_CONSTEXPR(match_v4si(_mm_maskz_sll_epi32(0xA, (__m128i)(__v4si){1, 2, 3, 4}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 4, 0, 8)); +TEST_CONSTEXPR(match_v8si(_mm256_mask_sll_epi32((__m256i)(__v8si){99, 99, 99, 99, 99, 99, 99, 99}, 0x5A, (__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){1, 0, 0, 0}), 99, 2, 99, 6, 8, 99, 12, 99)); +TEST_CONSTEXPR(match_v8si(_mm256_maskz_sll_epi32(0xA5, (__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 0, 4, 0, 0, 10, 0, 14)); __m128i test_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_slli_epi32 @@ -7007,8 +7019,12 @@ __m256i test_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) { // CHECK-LABEL: test_mm256_maskz_sll_epi64 // CHECK: @llvm.x86.avx2.psll.q // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} - return _mm256_maskz_sll_epi64(__U, __A, __B); + return _mm256_maskz_sll_epi64(__U, __A, __B); } +TEST_CONSTEXPR(match_v2di(_mm_mask_sll_epi64((__m128i)(__v2di){99, 99}, 0x1, (__m128i)(__v2di){1, 2}, (__m128i)(__v2di){1, 0}), 2, 99)); +TEST_CONSTEXPR(match_v2di(_mm_maskz_sll_epi64(0x2, (__m128i)(__v2di){1, 2}, (__m128i)(__v2di){1, 0}), 0, 4)); +TEST_CONSTEXPR(match_v4di(_mm256_mask_sll_epi64((__m256i)(__v4di){99, 99, 99, 99}, 0x5, (__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){1, 0}), 0, 99, 4, 99)); +TEST_CONSTEXPR(match_v4di(_mm256_maskz_sll_epi64(0xA, (__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){1, 0}), 0, 2, 0, 6)); __m128i test_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_slli_epi64 @@ -8478,8 +8494,12 @@ __m256i test_mm256_maskz_sra_epi32(__mmask8 __U, __m256i 
__A, __m128i __B) {
   // CHECK-LABEL: test_mm256_maskz_sra_epi32
   // CHECK: @llvm.x86.avx2.psra.d
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_sra_epi32(__U, __A, __B);
+  return _mm256_maskz_sra_epi32(__U, __A, __B);
 }
+TEST_CONSTEXPR(match_v4si(_mm_mask_sra_epi32((__m128i)(__v4si){99, 99, 99, 99}, 0x5, (__m128i)(__v4si){-2, 4, -6, 8}, (__m128i)(__v4si){1, 0, 0, 0}), -1, 99, -3, 99));
+TEST_CONSTEXPR(match_v4si(_mm_maskz_sra_epi32(0xA, (__m128i)(__v4si){-2, 4, -6, 8}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 2, 0, 4));
+TEST_CONSTEXPR(match_v8si(_mm256_mask_sra_epi32((__m256i)(__v8si){99, 99, 99, 99, 99, 99, 99, 99}, 0x5A, (__m256i)(__v8si){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v4si){1, 0, 0, 0}), 99, -1, 99, -3, 4, 99, 6, 99));
+TEST_CONSTEXPR(match_v8si(_mm256_maskz_sra_epi32(0xA5, (__m256i)(__v8si){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 0, 2, 0, 0, -5, 0, -7));

 __m128i test_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
   // CHECK-LABEL: test_mm_mask_srai_epi32
@@ -8542,8 +8562,10 @@ __m256i test_mm256_maskz_srai_epi32_2(__mmask8 __U, __m256i __A, unsigned int __
 __m128i test_mm_sra_epi64(__m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_sra_epi64
   // CHECK: @llvm.x86.avx512.psra.q.128
-  return _mm_sra_epi64(__A, __B);
+  return _mm_sra_epi64(__A, __B);
 }
+TEST_CONSTEXPR(match_v2di(_mm_sra_epi64((__m128i)(__v2di){-2, 4}, (__m128i)(__v2di){1, 0}), -1, 2));
+TEST_CONSTEXPR(match_v2di(_mm_sra_epi64((__m128i)(__v2di){-2, 4}, (__m128i)(__v2di){64, 0}), -1, 0));

 __m128i test_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_sra_epi64
@@ -8576,8 +8598,13 @@ __m256i test_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
   // CHECK-LABEL: test_mm256_maskz_sra_epi64
   // CHECK: @llvm.x86.avx512.psra.q.256
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
-  return _mm256_maskz_sra_epi64(__U, __A, __B);
+  return _mm256_maskz_sra_epi64(__U, __A, __B);
 }
+TEST_CONSTEXPR(match_v2di(_mm_mask_sra_epi64((__m128i)(__v2di){99, 99}, 0x1, (__m128i)(__v2di){-2, 4}, (__m128i)(__v2di){1, 0}), -1, 99));
+TEST_CONSTEXPR(match_v2di(_mm_maskz_sra_epi64(0x2, (__m128i)(__v2di){-2, 4}, (__m128i)(__v2di){1, 0}), 0, 2));
+TEST_CONSTEXPR(match_v4di(_mm256_sra_epi64((__m256i)(__v4di){-2, 4, -6, 8}, (__m128i)(__v2di){1, 0}), -1, 2, -3, 4));
+TEST_CONSTEXPR(match_v4di(_mm256_mask_sra_epi64((__m256i)(__v4di){99, 99, 99, 99}, 0x5, (__m256i)(__v4di){0, -2, 4, -6}, (__m128i)(__v2di){1, 0}), 0, 99, 2, 99));
+TEST_CONSTEXPR(match_v4di(_mm256_maskz_sra_epi64(0xA, (__m256i)(__v4di){0, -2, 4, -6}, (__m128i)(__v2di){1, 0}), 0, -1, 0, -3));

 __m128i test_mm_srai_epi64(__m128i __A) {
   // CHECK-LABEL: test_mm_srai_epi64
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index efe8930fae336..c24e8f8fdf9b2 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1336,18 +1336,24 @@ __m128i test_mm_sll_epi16(__m128i A, __m128i B) {
   // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sll_epi16(A, B);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_sll_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 2, 4, 6, 8, 10, 12, 14, 16));
+TEST_CONSTEXPR(match_v8hi(_mm_sll_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0));

 __m128i test_mm_sll_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sll_epi32
   // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sll_epi32(A, B);
 }
+TEST_CONSTEXPR(match_v4si(_mm_sll_epi32((__m128i)(__v4si){1, 2, 3, 4}, (__m128i)(__v4si){1, 0, 0, 0}), 2, 4, 6, 8));
+TEST_CONSTEXPR(match_v4si(_mm_sll_epi32((__m128i)(__v4si){1, 2, 3, 4}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0));

 __m128i test_mm_sll_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sll_epi64
   // CHECK: call {{.*}}<2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_sll_epi64(A, B);
 }
+TEST_CONSTEXPR(match_v2di(_mm_sll_epi64((__m128i)(__v2di){1, 2}, (__m128i)(__v2di){1, 0}), 2, 4));
+TEST_CONSTEXPR(match_v2di(_mm_sll_epi64((__m128i)(__v2di){1, 2}, (__m128i)(__v2di){64, 0}), 0, 0));

 __m128i test_mm_slli_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_slli_epi16
@@ -1452,12 +1458,16 @@ __m128i test_mm_sra_epi16(__m128i A, __m128i B) {
   // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sra_epi16(A, B);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_sra_epi16((__m128i)(__v8hi){-16, 16, -8, 8, -4, 4, -2, 2}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), -8, 8, -4, 4, -2, 2, -1, 1));
+TEST_CONSTEXPR(match_v8hi(_mm_sra_epi16((__m128i)(__v8hi){-16, 16, -8, 8, -4, 4, -2, 2}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), -1, 0, -1, 0, -1, 0, -1, 0));

 __m128i test_mm_sra_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sra_epi32
   // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sra_epi32(A, B);
 }
+TEST_CONSTEXPR(match_v4si(_mm_sra_epi32((__m128i)(__v4si){-16, 16, -8, 8}, (__m128i)(__v4si){1, 0, 0, 0}), -8, 8, -4, 4));
+TEST_CONSTEXPR(match_v4si(_mm_sra_epi32((__m128i)(__v4si){-16, 16, -8, 8}, (__m128i)(__v4si){32, 0, 0, 0}), -1, 0, -1, 0));

 __m128i test_mm_srai_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_srai_epi16
@@ -1503,18 +1513,24 @@ __m128i test_mm_srl_epi16(__m128i A, __m128i B) {
   // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_srl_epi16(A, B);
 }
+TEST_CONSTEXPR(match_v8hu(_mm_srl_epi16((__m128i)(__v8hu){0x8000, 16, 8, 4, 2, 1, 0, 0xFFFF}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0x4000, 8, 4, 2, 1, 0, 0, 0x7FFF));
+TEST_CONSTEXPR(match_v8hi(_mm_srl_epi16((__m128i)(__v8hi){-1, 16, 8, 4, 2, 1, 0, -1}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0));

 __m128i test_mm_srl_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_srl_epi32
   // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_srl_epi32(A, B);
 }
+TEST_CONSTEXPR(match_v4su(_mm_srl_epi32((__m128i)(__v4su){0x80000000, 16, 8, 4}, (__m128i)(__v4si){1, 0, 0, 0}), 0x40000000, 8, 4, 2));
+TEST_CONSTEXPR(match_v4si(_mm_srl_epi32((__m128i)(__v4si){-1, 16, 8, 4}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0));

 __m128i test_mm_srl_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_srl_epi64
   // CHECK: call {{.*}}<2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_srl_epi64(A, B);
 }
+TEST_CONSTEXPR(match_v2du(_mm_srl_epi64((__m128i)(__v2du){0x8000000000000000ULL, 16}, (__m128i)(__v2di){1, 0}), 0x4000000000000000ULL, 8));
+TEST_CONSTEXPR(match_v2di(_mm_srl_epi64((__m128i)(__v2di){-1, 16}, (__m128i)(__v2di){64, 0}), 0, 0));

 __m128i test_mm_srli_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_srli_epi16

From 607f1ea60b57b347f2f09198e6ce9de1ec8b8100 Mon Sep 17 00:00:00 2001
From: NagaChaitanya Vellanki
Date: Mon, 24 Nov 2025
13:59:28 -0800 Subject: [PATCH 2/5] Make MMX shift intrinsics constexpr _mm_sll_pi16 _mm_sll_pi32 _mm_sll_si64 _mm_sra_pi16 _mm_sra_pi32 _mm_srl_pi16 _mm_srl_pi32 _mm_srl_si64 Also use _zext128 instead of _anyext128 to avoid negative indices. --- clang/lib/Headers/mmintrin.h | 332 ++++++++++++-------------- clang/test/CodeGen/X86/mmx-builtins.c | 16 ++ 2 files changed, 169 insertions(+), 179 deletions(-) diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index aca78e6986ad9..c2a02b1959e1d 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -39,14 +39,14 @@ typedef short __v8hi __attribute__((__vector_size__(16))); typedef char __v16qi __attribute__((__vector_size__(16))); /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS_SSE2 \ __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ - __min_vector_width__(128))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr + __min_vector_width__(128))) constexpr #else -#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 +#define __DEFAULT_FN_ATTRS_SSE2 \ + __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ + __min_vector_width__(128))) #endif #define __trunc64(x) \ @@ -82,10 +82,8 @@ static __inline__ void /// A 32-bit integer value. /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the /// parameter. The upper 32 bits are set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cvtsi32_si64(int __i) -{ - return __extension__ (__m64)(__v2si){__i, 0}; +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi32_si64(int __i) { + return __extension__(__m64)(__v2si){__i, 0}; } /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit @@ -99,10 +97,8 @@ _mm_cvtsi32_si64(int __i) /// A 64-bit integer vector. /// \returns A 32-bit signed integer value containing the lower 32 bits of the /// parameter. -static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cvtsi64_si32(__m64 __m) -{ - return ((__v2si)__m)[0]; +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_si32(__m64 __m) { + return ((__v2si)__m)[0]; } /// Casts a 64-bit signed integer value into a 64-bit integer vector. @@ -115,10 +111,8 @@ _mm_cvtsi64_si32(__m64 __m) /// A 64-bit signed integer. /// \returns A 64-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cvtsi64_m64(long long __i) -{ - return __extension__ (__m64)(__v1di){__i}; +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_m64(long long __i) { + return __extension__(__m64)(__v1di){__i}; } /// Casts a 64-bit integer vector into a 64-bit signed integer value. @@ -131,10 +125,8 @@ _mm_cvtsi64_m64(long long __i) /// A 64-bit integer vector. /// \returns A 64-bit signed integer containing the same bitwise pattern as the /// parameter. -static __inline__ long long __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cvtm64_si64(__m64 __m) -{ - return ((__v1di)__m)[0]; +static __inline__ long long __DEFAULT_FN_ATTRS_SSE2 _mm_cvtm64_si64(__m64 __m) { + return ((__v1di)__m)[0]; } /// Converts, with saturation, 16-bit signed integers from both 64-bit integer @@ -156,8 +148,8 @@ _mm_cvtm64_si64(__m64 __m) /// written to the upper 32 bits of the result. 
/// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_packs_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, + __m64 __m2) { return __trunc64(__builtin_ia32_packsswb128( (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } @@ -181,8 +173,8 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2) { /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_packs_pi32(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi32(__m64 __m1, + __m64 __m2) { return __trunc64(__builtin_ia32_packssdw128( (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){})); } @@ -206,8 +198,8 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2) { /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_packs_pu16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pu16(__m64 __m1, + __m64 __m2) { return __trunc64(__builtin_ia32_packuswb128( (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } @@ -233,8 +225,8 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2) { /// Bits [63:56] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 12, 5, 13, 6, 14, 7, 15); } @@ -256,8 +248,8 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { /// Bits [63:48] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 6, 3, 7); } @@ -276,8 +268,8 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi32(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3); } @@ -302,8 +294,8 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { /// Bits [31:24] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8, 1, 9, 2, 10, 3, 11); } @@ -325,8 +317,8 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { /// Bits [31:16] are written to bits [63:48] of the result. 
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4, 1, 5); } @@ -345,8 +337,8 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi32(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2); } @@ -365,10 +357,9 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_add_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi8(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2)); } /// Adds each 16-bit integer element of the first 64-bit integer vector @@ -386,10 +377,9 @@ _mm_add_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_add_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi16(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2)); } /// Adds each 32-bit integer element of the first 64-bit integer vector @@ -407,10 +397,9 @@ _mm_add_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_add_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v2su)__m1) + ((__v2su)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi32(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v2su)__m1) + ((__v2su)__m2)); } /// Adds, with saturation, each 8-bit signed integer element of the first @@ -431,8 +420,8 @@ _mm_add_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_adds_pi8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_add_sat((__v8qs)__m1, (__v8qs)__m2); } @@ -454,8 +443,8 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_adds_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_add_sat((__v4hi)__m1, (__v4hi)__m2); } @@ -476,8 +465,8 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [8 x i8]. 
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_adds_pu8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_add_sat((__v8qu)__m1, (__v8qu)__m2); } @@ -498,8 +487,8 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_adds_pu16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_add_sat((__v4hu)__m1, (__v4hu)__m2); } @@ -518,10 +507,9 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_sub_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi8(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2)); } /// Subtracts each 16-bit integer element of the second 64-bit integer @@ -539,10 +527,9 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_sub_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi16(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2)); } /// Subtracts each 32-bit integer element of the second 64-bit integer @@ -560,10 +547,9 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32] containing the subtrahends. /// \returns A 64-bit integer vector of [2 x i32] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_sub_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v2su)__m1) - ((__v2su)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi32(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v2su)__m1) - ((__v2su)__m2)); } /// Subtracts, with saturation, each 8-bit signed integer element of the second @@ -584,8 +570,8 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_subs_pi8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_sub_sat((__v8qs)__m1, (__v8qs)__m2); } @@ -607,8 +593,8 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_subs_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_sub_sat((__v4hi)__m1, (__v4hi)__m2); } @@ -630,8 +616,8 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_subs_pu8(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu8(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_sub_sat((__v8qu)__m1, (__v8qu)__m2); } @@ -653,8 +639,8 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_subs_pu16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu16(__m64 __m1, + __m64 __m2) { return (__m64)__builtin_elementwise_sub_sat((__v4hu)__m1, (__v4hu)__m2); } @@ -679,8 +665,8 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of /// products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_madd_pi16(__m64 __m1, __m64 __m2) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_madd_pi16(__m64 __m1, + __m64 __m2) { return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1), (__v8hi)__zext128(__m2))); } @@ -700,11 +686,10 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2) { /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_mulhi_pi16(__m64 __m1, __m64 __m2) -{ - return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__zext128(__m1), - (__v8hi)__zext128(__m2))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pi16(__m64 __m1, + __m64 __m2) { + return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__zext128(__m1), + (__v8hi)__zext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -722,10 +707,9 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits /// of the products of both parameters. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_mullo_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mullo_pi16(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2)); } /// Left-shifts each 16-bit signed integer element of the first @@ -748,8 +732,8 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2) static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi16(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m), - (__v8hi)__anyext128(__count))); + return __trunc64(__builtin_ia32_psllw128((__v8hi)__zext128(__m), + (__v8hi)__zext128(__count))); } /// Left-shifts each 16-bit signed integer element of a 64-bit integer @@ -768,8 +752,8 @@ _mm_sll_pi16(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_slli_pi16(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi16(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psllwi128((__v8hi)__zext128(__m), __count)); } @@ -793,8 +777,8 @@ _mm_slli_pi16(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi32(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m), - (__v4si)__anyext128(__count))); + return __trunc64(__builtin_ia32_pslld128((__v4si)__zext128(__m), + (__v4si)__zext128(__count))); } /// Left-shifts each 32-bit signed integer element of a 64-bit integer @@ -813,8 +797,8 @@ _mm_sll_pi32(__m64 __m, __m64 __count) /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_slli_pi32(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi32(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_pslldi128((__v4si)__zext128(__m), __count)); } @@ -835,8 +819,8 @@ _mm_slli_pi32(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_si64(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m), - (__v2di)__anyext128(__count))); + return __trunc64(__builtin_ia32_psllq128((__v2di)__zext128(__m), + (__v2di)__zext128(__count))); } /// Left-shifts the first parameter, which is a 64-bit integer, by the @@ -853,8 +837,8 @@ _mm_sll_si64(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_slli_si64(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_si64(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psllqi128((__v2di)__zext128(__m), __count)); } @@ -879,8 +863,8 @@ _mm_slli_si64(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi16(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m), - (__v8hi)__anyext128(__count))); + return __trunc64(__builtin_ia32_psraw128((__v8hi)__zext128(__m), + (__v8hi)__zext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -900,8 +884,8 @@ _mm_sra_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_srai_pi16(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi16(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psrawi128((__v8hi)__zext128(__m), __count)); } @@ -926,8 +910,8 @@ _mm_srai_pi16(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi32(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m), - (__v4si)__anyext128(__count))); + return __trunc64(__builtin_ia32_psrad128((__v4si)__zext128(__m), + (__v4si)__zext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -947,8 +931,8 @@ _mm_sra_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_srai_pi32(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi32(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psradi128((__v4si)__zext128(__m), __count)); } @@ -972,8 +956,8 @@ _mm_srai_pi32(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi16(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m), - (__v8hi)__anyext128(__count))); + return __trunc64(__builtin_ia32_psrlw128((__v8hi)__zext128(__m), + (__v8hi)__zext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -992,8 +976,8 @@ _mm_srl_pi16(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_srli_pi16(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi16(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__zext128(__m), __count)); } @@ -1017,8 +1001,8 @@ _mm_srli_pi16(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi32(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m), - (__v4si)__anyext128(__count))); + return __trunc64(__builtin_ia32_psrld128((__v4si)__zext128(__m), + (__v4si)__zext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -1037,8 +1021,8 @@ _mm_srl_pi32(__m64 __m, __m64 __count) /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_srli_pi32(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi32(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psrldi128((__v4si)__zext128(__m), __count)); } @@ -1059,8 +1043,8 @@ _mm_srli_pi32(__m64 __m, int __count) { static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_si64(__m64 __m, __m64 __count) { - return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m), - (__v2di)__anyext128(__count))); + return __trunc64(__builtin_ia32_psrlq128((__v2di)__zext128(__m), + (__v2di)__zext128(__count))); } /// Right-shifts the first parameter, which is a 64-bit integer, by the @@ -1078,8 +1062,8 @@ _mm_srl_si64(__m64 __m, __m64 __count) /// \param __count /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_srli_si64(__m64 __m, int __count) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_si64(__m64 __m, + int __count) { return __trunc64(__builtin_ia32_psrlqi128((__v2di)__zext128(__m), __count)); } @@ -1095,10 +1079,9 @@ _mm_srli_si64(__m64 __m, int __count) { /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_and_si64(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v1du)__m1) & ((__v1du)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_and_si64(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise NOT of the first 64-bit integer vector, and then @@ -1116,10 +1099,9 @@ _mm_and_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of the second /// parameter and the one's complement of the first parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_andnot_si64(__m64 __m1, __m64 __m2) -{ - return (__m64)(~((__v1du)__m1) & ((__v1du)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_andnot_si64(__m64 __m1, + __m64 __m2) { + return (__m64)(~((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise OR of two 64-bit integer vectors. @@ -1134,10 +1116,9 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_or_si64(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v1du)__m1) | ((__v1du)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_or_si64(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v1du)__m1) | ((__v1du)__m2)); } /// Performs a bitwise exclusive OR of two 64-bit integer vectors. @@ -1152,10 +1133,9 @@ _mm_or_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_xor_si64(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_xor_si64(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1174,10 +1154,9 @@ _mm_xor_si64(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. 
/// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cmpeq_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi8(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2)); } /// Compares the 16-bit integer elements of two 64-bit integer vectors of @@ -1196,10 +1175,9 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cmpeq_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi16(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2)); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1218,10 +1196,9 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cmpeq_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)(((__v2si)__m1) == ((__v2si)__m2)); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi32(__m64 __m1, + __m64 __m2) { + return (__m64)(((__v2si)__m1) == ((__v2si)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1240,9 +1217,8 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cmpgt_pi8(__m64 __m1, __m64 __m2) -{ +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi8(__m64 __m1, + __m64 __m2) { /* This function always performs a signed comparison, but __v8qi is a char which may be signed or unsigned, so use __v8qs. */ return (__m64)((__v8qs)__m1 > (__v8qs)__m2); @@ -1264,10 +1240,9 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cmpgt_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)((__v4hi)__m1 > (__v4hi)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi16(__m64 __m1, + __m64 __m2) { + return (__m64)((__v4hi)__m1 > (__v4hi)__m2); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1286,10 +1261,9 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_cmpgt_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)((__v2si)__m1 > (__v2si)__m2); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi32(__m64 __m1, + __m64 __m2) { + return (__m64)((__v2si)__m1 > (__v2si)__m2); } /// Constructs a 64-bit integer vector initialized to zero. @@ -1299,8 +1273,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) /// This intrinsic corresponds to the PXOR instruction. /// /// \returns An initialized 64-bit integer vector with all elements set to zero. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_setzero_si64(void) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void) { return __extension__(__m64){0LL}; } @@ -1319,8 +1292,8 @@ _mm_setzero_si64(void) { /// A 32-bit integer value used to initialize the lower 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_set_pi32(int __i1, int __i0) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi32(int __i1, + int __i0) { return __extension__(__m64)(__v2si){__i0, __i1}; } @@ -1341,8 +1314,10 @@ _mm_set_pi32(int __i1, int __i0) { /// \param __s0 /// A 16-bit integer value used to initialize bits [15:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi16(short __s3, + short __s2, + short __s1, + short __s0) { return __extension__(__m64)(__v4hi){__s0, __s1, __s2, __s3}; } @@ -1371,7 +1346,7 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) { /// \param __b0 /// An 8-bit integer value used to initialize bits [7:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { return __extension__(__m64)(__v8qi){__b0, __b1, __b2, __b3, @@ -1391,8 +1366,7 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, /// A 32-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [2 x i32]. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_set1_pi32(int __i) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi32(int __i) { return _mm_set_pi32(__i, __i); } @@ -1409,8 +1383,7 @@ _mm_set1_pi32(int __i) { /// A 16-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [4 x i16]. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_set1_pi16(short __w) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi16(short __w) { return _mm_set_pi16(__w, __w, __w, __w); } @@ -1426,8 +1399,7 @@ _mm_set1_pi16(short __w) { /// An 8-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [8 x i8]. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_set1_pi8(char __b) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi8(char __b) { return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); } @@ -1446,8 +1418,8 @@ _mm_set1_pi8(char __b) { /// A 32-bit integer value used to initialize the upper 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_setr_pi32(int __i0, int __i1) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi32(int __i0, + int __i1) { return _mm_set_pi32(__i1, __i0); } @@ -1468,8 +1440,10 @@ _mm_setr_pi32(int __i0, int __i1) { /// \param __w3 /// A 16-bit integer value used to initialize bits [63:48] of the result. /// \returns An initialized 64-bit integer vector. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR -_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi16(short __w0, + short __w1, + short __w2, + short __w3) { return _mm_set_pi16(__w3, __w2, __w1, __w0); } @@ -1498,7 +1472,7 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { /// \param __b7 /// An 8-bit integer value used to initialize bits [63:56] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7) { return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index ad8a81c61ad43..37d6306ecdb7d 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -632,18 +632,24 @@ __m64 test_mm_sll_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w( return _mm_sll_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_sll_pi16((__m64)(__v4hi){1, 2, 3, 4}, (__m64)(__v4hi){1, 0, 0, 0}), 2, 4, 6, 8)); +TEST_CONSTEXPR(match_v4hi(_mm_sll_pi16((__m64)(__v4hi){1, 2, 3, 4}, (__m64)(__v4hi){16, 0, 0, 0}), 0, 0, 0, 0)); __m64 test_mm_sll_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_pi32 // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d( return _mm_sll_pi32(a, b); } +TEST_CONSTEXPR(match_v2si(_mm_sll_pi32((__m64)(__v2si){1, 2}, (__m64)(__v2si){1, 0}), 2, 4)); +TEST_CONSTEXPR(match_v2si(_mm_sll_pi32((__m64)(__v2si){1, 2}, (__m64)(__v2si){32, 0}), 0, 0)); __m64 test_mm_sll_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_si64 // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q( return _mm_sll_si64(a, b); } +TEST_CONSTEXPR(match_v1di(_mm_sll_si64((__m64)(__v1di){1}, (__m64)(__v1di){1}), 2)); +TEST_CONSTEXPR(match_v1di(_mm_sll_si64((__m64)(__v1di){1}, (__m64)(__v1di){64}), 0)); __m64 test_mm_slli_pi16(__m64 a) { // CHECK-LABEL: test_mm_slli_pi16 @@ -685,12 +691,16 @@ __m64 test_mm_sra_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w( return _mm_sra_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_sra_pi16((__m64)(__v4hi){-16, 16, -8, 8}, (__m64)(__v4hi){1, 0, 0, 0}), -8, 8, -4, 4)); +TEST_CONSTEXPR(match_v4hi(_mm_sra_pi16((__m64)(__v4hi){-16, 16, -8, 8}, (__m64)(__v4hi){16, 0, 0, 0}), -1, 0, -1, 0)); __m64 test_mm_sra_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sra_pi32 // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d( return _mm_sra_pi32(a, b); } +TEST_CONSTEXPR(match_v2si(_mm_sra_pi32((__m64)(__v2si){-16, 16}, (__m64)(__v2si){1, 0}), -8, 8)); +TEST_CONSTEXPR(match_v2si(_mm_sra_pi32((__m64)(__v2si){-16, 16}, (__m64)(__v2si){32, 0}), -1, 0)); __m64 test_mm_srai_pi16(__m64 a) { // CHECK-LABEL: test_mm_srai_pi16 @@ -722,18 +732,24 @@ __m64 test_mm_srl_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w( return _mm_srl_pi16(a, b); } +TEST_CONSTEXPR(match_v4hu(_mm_srl_pi16((__m64)(__v4hu){0x8000, 16, 8, 4}, (__m64)(__v4hi){1, 0, 0, 0}), 0x4000, 8, 4, 2)); +TEST_CONSTEXPR(match_v4hi(_mm_srl_pi16((__m64)(__v4hi){-1, 16, 8, 4}, (__m64)(__v4hi){16, 0, 0, 0}), 0, 0, 0, 0)); __m64 test_mm_srl_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_pi32 // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d( return _mm_srl_pi32(a, b); } +TEST_CONSTEXPR(match_v2su(_mm_srl_pi32((__m64)(__v2su){0x80000000, 16}, (__m64)(__v2si){1, 
0}), 0x40000000, 8)); +TEST_CONSTEXPR(match_v2si(_mm_srl_pi32((__m64)(__v2si){-1, 16}, (__m64)(__v2si){32, 0}), 0, 0)); __m64 test_mm_srl_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_si64 // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q( return _mm_srl_si64(a, b); } +TEST_CONSTEXPR(match_v1du(_mm_srl_si64((__m64)(__v1du){0x8000000000000000ULL}, (__m64)(__v1di){1}), 0x4000000000000000ULL)); +TEST_CONSTEXPR(match_v1di(_mm_srl_si64((__m64)(__v1di){-1}, (__m64)(__v1di){64}), 0)); __m64 test_mm_srli_pi16(__m64 a) { // CHECK-LABEL: test_mm_srli_pi16 From bfe01c05df583876ce196d2618c0d7edaf70acfc Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki Date: Tue, 25 Nov 2025 05:16:09 -0800 Subject: [PATCH 3/5] Remove unused __anyext128 macro --- clang/lib/Headers/mmintrin.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index c2a02b1959e1d..4829f9ec69cb8 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -54,9 +54,6 @@ typedef char __v16qi __attribute__((__vector_size__(16))); #define __zext128(x) \ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 1, 2, 3) -#define __anyext128(x) \ - (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ - 1, -1, -1) /// Clears the MMX state by setting the state of the x87 stack registers /// to empty. From 2a194b28b17ed9bd169b05cc8217aab8878fd849 Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki Date: Wed, 26 Nov 2025 11:42:33 -0800 Subject: [PATCH 4/5] Add more tests --- clang/test/CodeGen/X86/avx2-builtins.c | 20 ++++++++++++++++++++ clang/test/CodeGen/X86/avx512bw-builtins.c | 12 ++++++++++++ clang/test/CodeGen/X86/avx512f-builtins.c | 10 +++++++++- clang/test/CodeGen/X86/sse2-builtins.c | 20 ++++++++++++++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 398bf0c90105f..d6facfea8962e 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1198,6 +1198,10 @@ __m256i test_mm256_sll_epi16(__m256i a, __m128i b) { } TEST_CONSTEXPR(match_v16hi(_mm256_sll_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30)); TEST_CONSTEXPR(match_v16hi(_mm256_sll_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_sll_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){0, 1, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_sll_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){0, 0, 1, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_sll_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){0, 0, 0, 1, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_sll_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){1, 0, 0, 0, 1, 1, 1, 1}), 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30)); __m256i test_mm256_sll_epi32(__m256i a, __m128i b) { // CHECK-LABEL:
test_mm256_sll_epi32 @@ -1206,6 +1210,8 @@ __m256i test_mm256_sll_epi32(__m256i a, __m128i b) { } TEST_CONSTEXPR(match_v8si(_mm256_sll_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 2, 4, 6, 8, 10, 12, 14)); TEST_CONSTEXPR(match_v8si(_mm256_sll_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8si(_mm256_sll_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){0, 1, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8si(_mm256_sll_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){1, 0, 1, 1}), 0, 2, 4, 6, 8, 10, 12, 14)); __m256i test_mm256_sll_epi64(__m256i a, __m128i b) { // CHECK-LABEL: test_mm256_sll_epi64 @@ -1214,6 +1220,7 @@ __m256i test_mm256_sll_epi64(__m256i a, __m128i b) { } TEST_CONSTEXPR(match_v4di(_mm256_sll_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){1, 0}), 0, 2, 4, 6)); TEST_CONSTEXPR(match_v4di(_mm256_sll_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){64, 0}), 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v4di(_mm256_sll_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){1, 1}), 0, 2, 4, 6)); __m256i test_mm256_slli_epi16(__m256i a) { // CHECK-LABEL: test_mm256_slli_epi16 @@ -1309,6 +1316,10 @@ __m256i test_mm256_sra_epi16(__m256i a, __m128i b) { } TEST_CONSTEXPR(match_v16hi(_mm256_sra_epi16((__m256i)(__v16hi){-32768, 32767, -3, -2, -1, 0, 1, 2, -32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), -16384, 16383, -2, -1, -1, 0, 0, 1, -16384, 16383, -2, -1, -1, 0, 0, 1)); TEST_CONSTEXPR(match_v16hi(_mm256_sra_epi16((__m256i)(__v16hi){-32768, 32767, -3, -2, -1, 0, 1, 2, -32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), -1, 0, -1, -1, -1, 0, 0, 0, -1, 0, -1, -1, -1, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_sra_epi16((__m256i)(__v16hi){-32768, 32767, -3, -2, -1, 0, 1, 2, -32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v8hi){0, 1, 0, 0, 0, 0, 0, 0}), -1, 0, -1, -1, -1, 0, 0, 0, -1, 0, -1, -1, -1, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_sra_epi16((__m256i)(__v16hi){-32768, 32767, -3, -2, -1, 0, 1, 2, -32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v8hi){0, 0, 1, 0, 0, 0, 0, 0}), -1, 0, -1, -1, -1, 0, 0, 0, -1, 0, -1, -1, -1, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_sra_epi16((__m256i)(__v16hi){-32768, 32767, -3, -2, -1, 0, 1, 2, -32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v8hi){0, 0, 0, 1, 0, 0, 0, 0}), -1, 0, -1, -1, -1, 0, 0, 0, -1, 0, -1, -1, -1, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_sra_epi16((__m256i)(__v16hi){-32768, 32767, -3, -2, -1, 0, 1, 2, -32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v8hi){1, 0, 0, 0, 1, 1, 1, 1}), -16384, 16383, -2, -1, -1, 0, 0, 1, -16384, 16383, -2, -1, -1, 0, 0, 1)); __m256i test_mm256_sra_epi32(__m256i a, __m128i b) { // CHECK-LABEL: test_mm256_sra_epi32 @@ -1317,6 +1328,8 @@ __m256i test_mm256_sra_epi32(__m256i a, __m128i b) { } TEST_CONSTEXPR(match_v8si(_mm256_sra_epi32((__m256i)(__v8si){-32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v4si){1, 0, 0, 0}), -16384, 16383, -2, -1, -1, 0, 0, 1)); TEST_CONSTEXPR(match_v8si(_mm256_sra_epi32((__m256i)(__v8si){-32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v4si){32, 0, 0, 0}), -1, 0, -1, -1, -1, 0, 0, 0)); +TEST_CONSTEXPR(match_v8si(_mm256_sra_epi32((__m256i)(__v8si){-32768, 32767, -3, -2, -1, 0, 1, 2}, (__m128i)(__v4si){0, 1, 0, 0}), -1, 0, -1, -1, -1, 0, 0, 0)); +TEST_CONSTEXPR(match_v8si(_mm256_sra_epi32((__m256i)(__v8si){-32768, 32767, 
-3, -2, -1, 0, 1, 2}, (__m128i)(__v4si){1, 0, 1, 1}), -16384, 16383, -2, -1, -1, 0, 0, 1)); __m256i test_mm256_srai_epi16(__m256i a) { // CHECK-LABEL: test_mm256_srai_epi16 @@ -1365,6 +1378,10 @@ __m256i test_mm256_srl_epi16(__m256i a, __m128i b) { } TEST_CONSTEXPR(match_v16hi(_mm256_srl_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)); TEST_CONSTEXPR(match_v16hi(_mm256_srl_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_srl_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){0, 1, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_srl_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){0, 0, 1, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_srl_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){0, 0, 0, 1, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16hi(_mm256_srl_epi16((__m256i)(__v16hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v8hi){1, 0, 0, 0, 1, 1, 1, 1}), 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)); __m256i test_mm256_srl_epi32(__m256i a, __m128i b) { // CHECK-LABEL: test_mm256_srl_epi32 @@ -1373,6 +1390,8 @@ __m256i test_mm256_srl_epi32(__m256i a, __m128i b) { } TEST_CONSTEXPR(match_v8si(_mm256_srl_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 0, 1, 1, 2, 2, 3, 3)); TEST_CONSTEXPR(match_v8si(_mm256_srl_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8si(_mm256_srl_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){0, 1, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8si(_mm256_srl_epi32((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v4si){1, 0, 1, 1}), 0, 0, 1, 1, 2, 2, 3, 3)); __m256i test_mm256_srl_epi64(__m256i a, __m128i b) { // CHECK-LABEL: test_mm256_srl_epi64 @@ -1381,6 +1400,7 @@ __m256i test_mm256_srl_epi64(__m256i a, __m128i b) { } TEST_CONSTEXPR(match_v4di(_mm256_srl_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){1, 0}), 0, 0, 1, 1)); TEST_CONSTEXPR(match_v4di(_mm256_srl_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){64, 0}), 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v4di(_mm256_srl_epi64((__m256i)(__v4di){0, 1, 2, 3}, (__m128i)(__v2di){1, 1}), 0, 0, 1, 1)); __m256i test_mm256_srli_epi16(__m256i a) { // CHECK-LABEL: test_mm256_srli_epi16 diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 5a653cb3fa0a3..c9c30dab389db 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -2302,6 +2302,10 @@ __m512i test_mm512_sll_epi16(__m512i __A, __m128i __B) { TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62)); 
TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){17, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){0, 1, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){0, 0, 1, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){0, 0, 0, 1, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_sll_epi16((__m512i)(__v32hi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, (__m128i)(__v8hi){1, 0, 0, 0, 1, 1, 1, 1}), 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62)); __m512i test_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sll_epi16 @@ -2432,6 +2436,10 @@ __m512i test_mm512_sra_epi16(__m512i __A, __m128i __B) { TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17, 18, -19, 20, -21, 22, -23, 24, -25, 26, -27, 28, -29, 30, -31)); TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){17, 0, 0, 0, 0, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); +TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){0, 1, 0, 0, 0, 0, 0, 0}), 0, -1, 0, 
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); +TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){0, 0, 1, 0, 0, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); +TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){0, 0, 0, 1, 0, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); +TEST_CONSTEXPR(match_v32hi(_mm512_sra_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 1, 1, 1, 1}), 0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17, 18, -19, 20, -21, 22, -23, 24, -25, 26, -27, 28, -29, 30, -31)); __m512i test_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sra_epi16 @@ -2500,6 +2508,10 @@ __m512i test_mm512_srl_epi16(__m512i __A, __m128i __B) { TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0, 32767, 2, 32765, 4, 32763, 6, 32761, 8, 32759, 10, 32757, 12, 32755, 14, 32753, 16, 32751, 18, 32749, 20, 32747, 22, 32745, 24, 32743, 26, 32741, 28, 32739, 30, 32737)); TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){17, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){0, 1, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){0, 0, 1, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){0, 0, 0, 1, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v32hi(_mm512_srl_epi16((__m512i)(__v32hi){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30, 32, -34, 36, -38, 40, -42, 44, -46, 48, -50, 52, -54, 56, -58, 60, -62}, (__m128i)(__v8hi){1, 0, 0, 0, 1, 1, 1, 1}), 0, 32767, 2, 32765, 4, 32763, 6, 32761, 8, 32759, 10, 32757, 12, 32755, 14, 32753, 16, 32751, 18, 32749, 20, 32747, 22, 32745, 24, 32743, 26, 32741, 28, 32739, 30, 32737)); __m512i test_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_srl_epi16 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 9b1dae2228eb4..6401a0e55a83b 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -6287,6 +6287,8 @@ __m512i test_mm512_sll_epi32(__m512i __A, __m128i __B) { TEST_CONSTEXPR(match_v16si(_mm512_sll_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30)); TEST_CONSTEXPR(match_v16si(_mm512_sll_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); TEST_CONSTEXPR(match_v16si(_mm512_sll_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){33, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16si(_mm512_sll_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){0, 1, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16si(_mm512_sll_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m128i)(__v4si){1, 0, 1, 1}), 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30)); __m512i test_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sll_epi32 @@ -6312,6 +6314,7 @@ __m512i test_mm512_sll_epi64(__m512i __A, __m128i __B) { TEST_CONSTEXPR(match_v8di(_mm512_sll_epi64((__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v2di){1, 0}), 0, 2, 4, 6, 8, 10, 12, 14)); TEST_CONSTEXPR(match_v8di(_mm512_sll_epi64((__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v2di){64, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); TEST_CONSTEXPR(match_v8di(_mm512_sll_epi64((__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v2di){65, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8di(_mm512_sll_epi64((__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m128i)(__v2di){1, 1}), 0, 2, 4, 6, 8, 10, 12, 14)); __m512i test_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sll_epi64 @@ -6383,6 +6386,8 @@ __m512i test_mm512_sra_epi32(__m512i __A, __m128i __B) { TEST_CONSTEXPR(match_v16si(_mm512_sra_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 0, 0}), 0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15)); TEST_CONSTEXPR(match_v16si(_mm512_sra_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){32, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); TEST_CONSTEXPR(match_v16si(_mm512_sra_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){33, 0, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 
-1, 0, -1)); +TEST_CONSTEXPR(match_v16si(_mm512_sra_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){0, 1, 0, 0}), 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1)); +TEST_CONSTEXPR(match_v16si(_mm512_sra_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 1, 1}), 0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15)); __m512i test_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sra_epi32 @@ -6408,6 +6413,7 @@ __m512i test_mm512_sra_epi64(__m512i __A, __m128i __B) { TEST_CONSTEXPR(match_v8di(_mm512_sra_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 0}), 0, -1, 2, -3, 4, -5, 6, -7)); TEST_CONSTEXPR(match_v8di(_mm512_sra_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){64, 0}), 0, -1, 0, -1, 0, -1, 0, -1)); TEST_CONSTEXPR(match_v8di(_mm512_sra_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){65, 0}), 0, -1, 0, -1, 0, -1, 0, -1)); +TEST_CONSTEXPR(match_v8di(_mm512_sra_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 1}), 0, -1, 2, -3, 4, -5, 6, -7)); __m512i test_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_sra_epi64 @@ -6479,6 +6485,8 @@ __m512i test_mm512_srl_epi32(__m512i __A, __m128i __B) { TEST_CONSTEXPR(match_v16si(_mm512_srl_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 0, 0}), 0, 2147483647, 2, 2147483645, 4, 2147483643, 6, 2147483641, 8, 2147483639, 10, 2147483637, 12, 2147483635, 14, 2147483633)); TEST_CONSTEXPR(match_v16si(_mm512_srl_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); TEST_CONSTEXPR(match_v16si(_mm512_srl_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){33, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16si(_mm512_srl_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){0, 1, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16si(_mm512_srl_epi32((__m512i)(__v16si){0, -2, 4, -6, 8, -10, 12, -14, 16, -18, 20, -22, 24, -26, 28, -30}, (__m128i)(__v4si){1, 0, 1, 1}), 0, 2147483647, 2, 2147483645, 4, 2147483643, 6, 2147483641, 8, 2147483639, 10, 2147483637, 12, 2147483635, 14, 2147483633)); __m512i test_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_srl_epi32 @@ -6503,7 +6511,7 @@ __m512i test_mm512_srl_epi64(__m512i __A, __m128i __B) { } TEST_CONSTEXPR(match_v8di(_mm512_srl_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 0}), 0, 9223372036854775807, 2, 9223372036854775805, 4, 9223372036854775803, 6, 9223372036854775801)); TEST_CONSTEXPR(match_v8di(_mm512_srl_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){64, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); -TEST_CONSTEXPR(match_v8di(_mm512_srl_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){65, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8di(_mm512_srl_epi64((__m512i)(__v8di){0, -2, 4, -6, 8, -10, 12, -14}, (__m128i)(__v2di){1, 1}), 0, 
9223372036854775807, 2, 9223372036854775805, 4, 9223372036854775803, 6, 9223372036854775801)); __m512i test_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { // CHECK-LABEL: test_mm512_mask_srl_epi64 diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c index c24e8f8fdf9b2..ed1ac84b8c4a3 100644 --- a/clang/test/CodeGen/X86/sse2-builtins.c +++ b/clang/test/CodeGen/X86/sse2-builtins.c @@ -1338,6 +1338,10 @@ __m128i test_mm_sll_epi16(__m128i A, __m128i B) { } TEST_CONSTEXPR(match_v8hi(_mm_sll_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 2, 4, 6, 8, 10, 12, 14, 16)); TEST_CONSTEXPR(match_v8hi(_mm_sll_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_sll_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){0, 1, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_sll_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){0, 0, 1, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_sll_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){0, 0, 0, 1, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_sll_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){1, 0, 0, 0, 1, 1, 1, 1}), 2, 4, 6, 8, 10, 12, 14, 16)); __m128i test_mm_sll_epi32(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_sll_epi32 @@ -1346,6 +1350,8 @@ __m128i test_mm_sll_epi32(__m128i A, __m128i B) { } TEST_CONSTEXPR(match_v4si(_mm_sll_epi32((__m128i)(__v4si){1, 2, 3, 4}, (__m128i)(__v4si){1, 0, 0, 0}), 2, 4, 6, 8)); TEST_CONSTEXPR(match_v4si(_mm_sll_epi32((__m128i)(__v4si){1, 2, 3, 4}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v4si(_mm_sll_epi32((__m128i)(__v4si){1, 2, 3, 4}, (__m128i)(__v4si){0, 1, 0, 0}), 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v4si(_mm_sll_epi32((__m128i)(__v4si){1, 2, 3, 4}, (__m128i)(__v4si){1, 0, 1, 1}), 2, 4, 6, 8)); __m128i test_mm_sll_epi64(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_sll_epi64 @@ -1354,6 +1360,7 @@ __m128i test_mm_sll_epi64(__m128i A, __m128i B) { } TEST_CONSTEXPR(match_v2di(_mm_sll_epi64((__m128i)(__v2di){1, 2}, (__m128i)(__v2di){1, 0}), 2, 4)); TEST_CONSTEXPR(match_v2di(_mm_sll_epi64((__m128i)(__v2di){1, 2}, (__m128i)(__v2di){64, 0}), 0, 0)); +TEST_CONSTEXPR(match_v2di(_mm_sll_epi64((__m128i)(__v2di){1, 2}, (__m128i)(__v2di){1, 1}), 2, 4)); __m128i test_mm_slli_epi16(__m128i A) { // CHECK-LABEL: test_mm_slli_epi16 @@ -1460,6 +1467,10 @@ __m128i test_mm_sra_epi16(__m128i A, __m128i B) { } TEST_CONSTEXPR(match_v8hi(_mm_sra_epi16((__m128i)(__v8hi){-16, 16, -8, 8, -4, 4, -2, 2}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), -8, 8, -4, 4, -2, 2, -1, 1)); TEST_CONSTEXPR(match_v8hi(_mm_sra_epi16((__m128i)(__v8hi){-16, 16, -8, 8, -4, 4, -2, 2}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), -1, 0, -1, 0, -1, 0, -1, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_sra_epi16((__m128i)(__v8hi){-16, 16, -8, 8, -4, 4, -2, 2}, (__m128i)(__v8hi){0, 1, 0, 0, 0, 0, 0, 0}), -1, 0, -1, 0, -1, 0, -1, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_sra_epi16((__m128i)(__v8hi){-16, 16, -8, 8, -4, 4, -2, 2}, (__m128i)(__v8hi){0, 0, 1, 0, 0, 0, 0, 0}), -1, 0, -1, 0, -1, 0, -1, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_sra_epi16((__m128i)(__v8hi){-16, 16, -8, 8, -4, 4, -2, 2}, (__m128i)(__v8hi){0, 0, 0, 1, 0, 0, 0, 0}), -1, 0, -1, 0, -1, 0, -1, 0)); 
+TEST_CONSTEXPR(match_v8hi(_mm_sra_epi16((__m128i)(__v8hi){-16, 16, -8, 8, -4, 4, -2, 2}, (__m128i)(__v8hi){1, 0, 0, 0, 1, 1, 1, 1}), -8, 8, -4, 4, -2, 2, -1, 1)); __m128i test_mm_sra_epi32(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_sra_epi32 @@ -1468,6 +1479,8 @@ __m128i test_mm_sra_epi32(__m128i A, __m128i B) { } TEST_CONSTEXPR(match_v4si(_mm_sra_epi32((__m128i)(__v4si){-16, 16, -8, 8}, (__m128i)(__v4si){1, 0, 0, 0}), -8, 8, -4, 4)); TEST_CONSTEXPR(match_v4si(_mm_sra_epi32((__m128i)(__v4si){-16, 16, -8, 8}, (__m128i)(__v4si){32, 0, 0, 0}), -1, 0, -1, 0)); +TEST_CONSTEXPR(match_v4si(_mm_sra_epi32((__m128i)(__v4si){-16, 16, -8, 8}, (__m128i)(__v4si){0, 1, 0, 0}), -1, 0, -1, 0)); +TEST_CONSTEXPR(match_v4si(_mm_sra_epi32((__m128i)(__v4si){-16, 16, -8, 8}, (__m128i)(__v4si){1, 0, 1, 1}), -8, 8, -4, 4)); __m128i test_mm_srai_epi16(__m128i A) { // CHECK-LABEL: test_mm_srai_epi16 @@ -1515,6 +1528,10 @@ __m128i test_mm_srl_epi16(__m128i A, __m128i B) { } TEST_CONSTEXPR(match_v8hu(_mm_srl_epi16((__m128i)(__v8hu){0x8000, 16, 8, 4, 2, 1, 0, 0xFFFF}, (__m128i)(__v8hi){1, 0, 0, 0, 0, 0, 0, 0}), 0x4000, 8, 4, 2, 1, 0, 0, 0x7FFF)); TEST_CONSTEXPR(match_v8hi(_mm_srl_epi16((__m128i)(__v8hi){-1, 16, 8, 4, 2, 1, 0, -1}, (__m128i)(__v8hi){16, 0, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_srl_epi16((__m128i)(__v8hi){-1, 16, 8, 4, 2, 1, 0, -1}, (__m128i)(__v8hi){0, 1, 0, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_srl_epi16((__m128i)(__v8hi){-1, 16, 8, 4, 2, 1, 0, -1}, (__m128i)(__v8hi){0, 0, 1, 0, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8hi(_mm_srl_epi16((__m128i)(__v8hi){-1, 16, 8, 4, 2, 1, 0, -1}, (__m128i)(__v8hi){0, 0, 0, 1, 0, 0, 0, 0}), 0, 0, 0, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8hu(_mm_srl_epi16((__m128i)(__v8hu){0x8000, 16, 8, 4, 2, 1, 0, 0xFFFF}, (__m128i)(__v8hi){1, 0, 0, 0, 1, 1, 1, 1}), 0x4000, 8, 4, 2, 1, 0, 0, 0x7FFF)); __m128i test_mm_srl_epi32(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_srl_epi32 @@ -1523,6 +1540,8 @@ __m128i test_mm_srl_epi32(__m128i A, __m128i B) { } TEST_CONSTEXPR(match_v4su(_mm_srl_epi32((__m128i)(__v4su){0x80000000, 16, 8, 4}, (__m128i)(__v4si){1, 0, 0, 0}), 0x40000000, 8, 4, 2)); TEST_CONSTEXPR(match_v4si(_mm_srl_epi32((__m128i)(__v4si){-1, 16, 8, 4}, (__m128i)(__v4si){32, 0, 0, 0}), 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v4si(_mm_srl_epi32((__m128i)(__v4si){-1, 16, 8, 4}, (__m128i)(__v4si){0, 1, 0, 0}), 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v4su(_mm_srl_epi32((__m128i)(__v4su){0x80000000, 16, 8, 4}, (__m128i)(__v4si){1, 0, 1, 1}), 0x40000000, 8, 4, 2)); __m128i test_mm_srl_epi64(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_srl_epi64 @@ -1531,6 +1550,7 @@ __m128i test_mm_srl_epi64(__m128i A, __m128i B) { } TEST_CONSTEXPR(match_v2du(_mm_srl_epi64((__m128i)(__v2du){0x8000000000000000ULL, 16}, (__m128i)(__v2di){1, 0}), 0x4000000000000000ULL, 8)); TEST_CONSTEXPR(match_v2di(_mm_srl_epi64((__m128i)(__v2di){-1, 16}, (__m128i)(__v2di){64, 0}), 0, 0)); +TEST_CONSTEXPR(match_v2du(_mm_srl_epi64((__m128i)(__v2du){0x8000000000000000ULL, 16}, (__m128i)(__v2di){1, 1}), 0x4000000000000000ULL, 8)); __m128i test_mm_srli_epi16(__m128i A) { // CHECK-LABEL: test_mm_srli_epi16 From e0b578d919752d268c0ad51ac832d393ced3d2d6 Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki Date: Thu, 27 Nov 2025 05:46:07 -0800 Subject: [PATCH 5/5] Remove undef for __anyext128 --- clang/lib/Headers/mmintrin.h | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/Headers/mmintrin.h 
b/clang/lib/Headers/mmintrin.h index 4829f9ec69cb8..2cf46455d7915 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -1475,7 +1475,6 @@ _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); } -#undef __anyext128 #undef __trunc64 #undef __DEFAULT_FN_ATTRS_SSE2
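
A quick usage sketch of what the series enables, for readers skimming the diffs. This is hypothetical demo code, not taken from any patch above: the file name, build line, demo_v8hi typedef, and lane() helper are invented, and it assumes a clang that carries these changes. It leans on the semantics the TEST_CONSTEXPR lines encode: the shift count is read from the low 64 bits of the count operand, and a count at or beyond the element width zeroes every lane (the psra* forms sign-fill instead).

// demo_constexpr_shift.cpp (hypothetical): with the variable-count shift
// intrinsics constexpr-capable, they fold inside C++ constant expressions.
// Build sketch, assuming a patched clang:
//   clang++ -std=c++17 -c demo_constexpr_shift.cpp
#include <emmintrin.h>

typedef short demo_v8hi __attribute__((__vector_size__(16)));

// Invented helper: view a __m128i as eight shorts and pick lane i.
constexpr short lane(__m128i v, int i) { return ((demo_v8hi)v)[i]; }

constexpr demo_v8hi a   = {1, 2, 3, 4, 5, 6, 7, 8};
constexpr demo_v8hi by1 = {1, 0, 0, 0, 0, 0, 0, 0};  // count 1 in the low 64 bits
constexpr demo_v8hi big = {16, 0, 0, 0, 0, 0, 0, 0}; // count == element width

static_assert(lane(_mm_sll_epi16((__m128i)a, (__m128i)by1), 0) == 2, "1 << 1");
static_assert(lane(_mm_sll_epi16((__m128i)a, (__m128i)by1), 7) == 16, "8 << 1");
static_assert(lane(_mm_sll_epi16((__m128i)a, (__m128i)big), 3) == 0, "count >= 16 clears every lane");

The C test files express the same checks through the suite's TEST_CONSTEXPR and match_v8hi helpers, as the hunks above do.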