diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 0c85e280e748b..72e67d7dda3bc 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1746,75 +1746,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">; + def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">; + def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; + def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">; + def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { + def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">; + def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">; + def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">; } -let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index d0b97a18e1815..f249a113a95ab 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3358,18 +3358,46 @@ static bool interp__builtin_ia32_shuffle_generic( GetSourceIndex) { assert(Call->getNumArgs() == 3); - unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); + + unsigned ShuffleMask = 0; + Pointer A, MaskVector, B; + + QualType Arg2Type = Call->getArg(2)->getType(); + bool IsVectorMask = false; + if (Arg2Type->isVectorType()) { + IsVectorMask = true; + B = S.Stk.pop(); + MaskVector = S.Stk.pop(); + A = S.Stk.pop(); + } else if (Arg2Type->isIntegerType()) { + ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); + B = S.Stk.pop(); + A = S.Stk.pop(); + } else { + return false; + } QualType Arg0Type = Call->getArg(0)->getType(); const auto *VecT = Arg0Type->castAs(); PrimType ElemT = *S.getContext().classify(VecT->getElementType()); unsigned NumElems = VecT->getNumElements(); - const Pointer &B = S.Stk.pop(); - const Pointer &A = S.Stk.pop(); const Pointer &Dst = S.Stk.peek(); + PrimType MaskElemT = PT_Uint32; + if (IsVectorMask) { + QualType Arg1Type = Call->getArg(1)->getType(); + const auto *MaskVecT = Arg1Type->castAs(); + QualType MaskElemType = MaskVecT->getElementType(); + MaskElemT = *S.getContext().classify(MaskElemType); + } + for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) { + if (IsVectorMask) { + INT_TYPE_SWITCH(MaskElemT, { + ShuffleMask = static_cast(MaskVector.elem(DstIdx)); + }); + } auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); const Pointer &Src = (SrcVecIdx == 0) ? A : B; TYPE_SWITCH(ElemT, { Dst.elem(DstIdx) = Src.elem(SrcIdx); }); @@ -4345,6 +4373,60 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; return std::pair{SrcIdx, LaneOffset + Index}; }); + case X86::BI__builtin_ia32_vpermi2varq128: + case X86::BI__builtin_ia32_vpermi2varpd128: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x1; + unsigned SrcIdx = (ShuffleMask >> 1) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + }); + case X86::BI__builtin_ia32_vpermi2vard128: + case X86::BI__builtin_ia32_vpermi2varps128: + case X86::BI__builtin_ia32_vpermi2varq256: + case X86::BI__builtin_ia32_vpermi2varpd256: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x3; + unsigned SrcIdx = (ShuffleMask >> 2) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + }); + case X86::BI__builtin_ia32_vpermi2varhi128: + case X86::BI__builtin_ia32_vpermi2vard256: + case X86::BI__builtin_ia32_vpermi2varps256: + case X86::BI__builtin_ia32_vpermi2varq512: + case X86::BI__builtin_ia32_vpermi2varpd512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x7; + unsigned SrcIdx = (ShuffleMask >> 3) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + }); + case X86::BI__builtin_ia32_vpermi2varqi128: + case X86::BI__builtin_ia32_vpermi2varhi256: + case X86::BI__builtin_ia32_vpermi2vard512: + case X86::BI__builtin_ia32_vpermi2varps512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0xF; + unsigned SrcIdx = (ShuffleMask >> 4) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + }); + case X86::BI__builtin_ia32_vpermi2varqi256: + case X86::BI__builtin_ia32_vpermi2varhi512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x1F; + unsigned SrcIdx = (ShuffleMask >> 5) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + }); + case X86::BI__builtin_ia32_vpermi2varqi512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x3F; + unsigned SrcIdx = (ShuffleMask >> 6) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + }); case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 29ee089505125..1427005b9bd79 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11628,21 +11628,38 @@ static bool evalShuffleGeneric( if (!VT) return false; - APSInt MaskImm; - if (!EvaluateInteger(Call->getArg(2), MaskImm, Info)) - return false; - unsigned ShuffleMask = static_cast(MaskImm.getZExtValue()); + unsigned ShuffleMask = 0; + APValue A, MaskVector, B; + bool IsVectorMask = false; - APValue A, B; - if (!EvaluateAsRValue(Info, Call->getArg(0), A) || - !EvaluateAsRValue(Info, Call->getArg(1), B)) + QualType Arg2Type = Call->getArg(2)->getType(); + if (Arg2Type->isVectorType()) { + IsVectorMask = true; + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) || + !EvaluateAsRValue(Info, Call->getArg(2), B)) + return false; + } else if (Arg2Type->isIntegerType()) { + APSInt MaskImm; + if (!EvaluateInteger(Call->getArg(2), MaskImm, Info)) + return false; + ShuffleMask = static_cast(MaskImm.getZExtValue()); + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), B)) + return false; + } else { return false; + } unsigned NumElts = VT->getNumElements(); - SmallVector ResultElements; + SmallVector ResultElements; ResultElements.reserve(NumElts); for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) { + if (IsVectorMask) { + ShuffleMask = static_cast( + MaskVector.getVectorElt(DstIdx).getInt().getZExtValue()); + } auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); const APValue &Src = (SrcVecIdx == 0) ? A : B; ResultElements.push_back(Src.getVectorElt(SrcIdx)); @@ -13048,6 +13065,84 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_vpermi2varq128: + case X86::BI__builtin_ia32_vpermi2varpd128: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x1; + unsigned SrcIdx = (ShuffleMask >> 1) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2vard128: + case X86::BI__builtin_ia32_vpermi2varps128: + case X86::BI__builtin_ia32_vpermi2varq256: + case X86::BI__builtin_ia32_vpermi2varpd256: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x3; + unsigned SrcIdx = (ShuffleMask >> 2) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varhi128: + case X86::BI__builtin_ia32_vpermi2vard256: + case X86::BI__builtin_ia32_vpermi2varps256: + case X86::BI__builtin_ia32_vpermi2varq512: + case X86::BI__builtin_ia32_vpermi2varpd512: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x7; + unsigned SrcIdx = (ShuffleMask >> 3) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varqi128: + case X86::BI__builtin_ia32_vpermi2varhi256: + case X86::BI__builtin_ia32_vpermi2vard512: + case X86::BI__builtin_ia32_vpermi2varps512: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0xF; + unsigned SrcIdx = (ShuffleMask >> 4) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varqi256: + case X86::BI__builtin_ia32_vpermi2varhi512: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x1F; + unsigned SrcIdx = (ShuffleMask >> 5) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varqi512: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned offset = ShuffleMask & 0x3F; + unsigned SrcIdx = (ShuffleMask >> 6) & 0x1 ? 1 : 0; + return std::pair{SrcIdx, offset}; + })) + return false; + return Success(R, E); + } } } diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h index 37ebc4f46a826..46ec12a63ef9c 100644 --- a/clang/lib/Headers/avx10_2_512bf16intrin.h +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1))); __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \ __min_vector_width__(512))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#else +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#endif + static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) { return __builtin_bit_cast(__m512bh, _mm512_setzero_ps()); } @@ -167,7 +173,7 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { (__v32bf)__A); } -static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); @@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh( (__v32bf)_mm512_setzero_pbh()); } +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR #undef __DEFAULT_FN_ATTRS512 #endif diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h index 765cd682986b4..8fb8cd7cd0865 100644 --- a/clang/lib/Headers/avx10_2bf16intrin.h +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1))); __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \ __min_vector_width__(128))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) { return __builtin_bit_cast(__m256bh, _mm256_setzero_ps()); } @@ -287,13 +295,13 @@ _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { (__v16bf)__A); } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi)__B); } -static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); @@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 - +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif #endif diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index ac75b6ccde735..aab1f2b61ab8a 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -969,35 +969,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)_mm512_setzero_si512()); diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 18c4a44a4c76e..5fc0afa49ce4c 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -3059,69 +3059,61 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) /* Vector permutations */ -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, (__v16si) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, (__v8di) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)_mm512_setzero_si512()); @@ -5949,71 +5941,66 @@ _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) (__v16sf)_mm512_setzero_ps()); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) { return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I, (__v8df)__B); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)__A); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, - __m512d __B) -{ + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)(__m512d)__I); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, - __m512d __B) -{ + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) { return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I, (__v16sf) __B); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)__A); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)(__m512)__I); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)_mm512_setzero_ps()); } - #define _mm512_cvtt_roundpd_epu32(A, R) \ ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_undefined_si256(), \ diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 142cc079c2c4b..25051228f3e0a 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -3316,13 +3316,13 @@ _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { (__v32hf)__A); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutexvar_ph(__m512i __A, __m512h __B) { return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h index 964535c4c4900..84fda5c5849e8 100644 --- a/clang/lib/Headers/avx512vbmiintrin.h +++ b/clang/lib/Headers/avx512vbmiintrin.h @@ -19,59 +19,57 @@ __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), \ __min_vector_width__(512))) -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) -{ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutexvar_epi8 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_permutexvar_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)__W); @@ -99,8 +97,6 @@ _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), (__v64qi)_mm512_setzero_si512()); } - - +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #undef __DEFAULT_FN_ATTRS - #endif diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h index 4c50be7d9e7e5..58a48dadff863 100644 --- a/clang/lib/Headers/avx512vbmivlintrin.h +++ b/clang/lib/Headers/avx512vbmivlintrin.h @@ -24,117 +24,110 @@ __target__("avx512vbmi,avx512vl"), \ __min_vector_width__(256))) -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) -{ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, (__v16qi)__I, (__v16qi)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I, (__v32qi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__I); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutexvar_epi8 (__m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutexvar_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi8 (__m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutexvar_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)__W); @@ -186,7 +179,8 @@ _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) (__v32qi)_mm256_setzero_si256()); } - +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 0fcfe3779fa19..a0990e2e0d19f 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -1223,69 +1223,61 @@ _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi) __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__I); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I, + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)_mm256_setzero_si256()); diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index 5b2b3f0d0bbd4..885231b030b23 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -2010,24 +2010,24 @@ _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { (__v16hf)__A); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi)__B); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutexvar_ph(__m128i __A, __m128h __B) { return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutexvar_ph(__m256i __A, __m256h __B) { return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); } diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 92bb444aeb5b8..e5249926b934e 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -3556,13 +3556,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I, (__v4si)__B); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3570,7 +3570,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)__A); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3578,7 +3578,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)__I); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3586,13 +3586,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)_mm_setzero_si128()); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I, (__v8si) __B); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3600,7 +3600,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)__A); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3608,7 +3608,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)__I); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3616,40 +3616,43 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)_mm256_setzero_si256()); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I, (__v2df)__B); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)__A); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)(__m128d)__I); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)_mm_setzero_pd()); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I, (__v4df)__B); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3657,7 +3660,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)__A); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3665,7 +3668,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)(__m256d)__I); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3673,47 +3676,48 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)_mm256_setzero_pd()); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I, (__v4sf)__B); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)__A); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)(__m128)__I); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)_mm_setzero_ps()); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I, (__v8sf) __B); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR + _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, + __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), (__v8sf)__A); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, @@ -3721,7 +3725,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)(__m256)__I); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, @@ -3729,13 +3733,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I, (__v2di)__B); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3743,7 +3747,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)__A); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3751,7 +3755,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)__I); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3759,14 +3763,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)_mm_setzero_si128()); } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I, (__v4di) __B); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, @@ -3774,7 +3777,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4di)__A); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, @@ -3782,7 +3785,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4di)__I); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index be2cd480f7558..e6e2e38bcc097 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -1591,6 +1591,132 @@ __m512i test_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i return _mm512_maskz_permutex2var_epi16(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v32hi( + _mm512_permutex2var_epi16( + (__m512i)(__v32hi){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150, + 160, 170, 180, 190, 200, 210, 220, 230, + 240, 250, 260, 270, 280, 290, 300, 310}, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + 0, 400, 10, 410, 20, 420, 30, 430, + 40, 440, 50, 450, 60, 460, 70, 470, + 80, 480, 90, 490, 100, 500, 110, 510, + 120, 520, 130, 530, 140, 540, 150, 550)); +TEST_CONSTEXPR(match_v32hi( + _mm512_mask_permutex2var_epi16( + (__m512i)(__v32hi){ + -1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16, + -17, -18, -19, -20, -21, -22, -23, -24, + -25, -26, -27, -28, -29, -30, -31, -32}, + 0xAAAAAAAA, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + -1, 400, -3, 410, -5, 420, -7, 430, + -9, 440, -11, 450, -13, 460, -15, 470, + -17, 480, -19, 490, -21, 500, -23, 510, + -25, 520, -27, 530, -29, 540, -31, 550)); +TEST_CONSTEXPR(match_v32hi( + _mm512_maskz_permutex2var_epi16( + 0x55555555, + (__m512i)(__v32hi){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150, + 160, 170, 180, 190, 200, 210, 220, 230, + 240, 250, 260, 270, 280, 290, 300, 310}, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + 0, 0, 10, 0, 20, 0, 30, 0, + 40, 0, 50, 0, 60, 0, 70, 0, + 80, 0, 90, 0, 100, 0, 110, 0, + 120, 0, 130, 0, 140, 0, 150, 0)); + +TEST_CONSTEXPR(match_v64qu( + _mm512_permutex2var_epi8( + (__m512i)(__v64qu){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 127, 126, 125, + 124, 123, 122, 121, 120, 119, 118, 117, + 116, 115, 114, 113, 112, 111, 110, 109, + 108, 107, 106, 105, 104, 103, 102, 101, + 100, 99, 98, 97, 96, 95, 94, 93, + 92, 91, 90, 89, 88, 87, 86, 85, + 84, 83, 82, 81, 80, 79, 78, 77}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 210, 220, 230, 240, 250, 254, 253, + 252, 251, 250, 249, 248, 247, 246, 245, + 244, 243, 242, 241, 240, 239, 238, 237, + 236, 235, 234, 233, 232, 231, 230, 229, + 228, 227, 226, 225, 224, 223, 222, 221, + 220, 219, 218, 217, 216, 215, 214, 213, + 212, 211, 210, 209, 208, 207, 206, 205, + 204, 203, 202, 201, 200, 199, 198, 197}), + 0, 200, 10, 210, 20, 220, 30, 230, + 40, 240, 50, 250, 60, 254, 70, 253, + 80, 252, 90, 251, 100, 250, 110, 249, + 120, 248, 127, 247, 126, 246, 125, 245, + 124, 244, 123, 243, 122, 242, 121, 241, + 120, 240, 119, 239, 118, 238, 117, 237, + 116, 236, 115, 235, 114, 234, 113, 233, + 112, 232, 111, 231, 110, 230, 109, 229)); +TEST_CONSTEXPR(match_v32hi( + _mm512_mask2_permutex2var_epi16( + (__m512i)(__v32hi){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150, + 160, 170, 180, 190, 200, 210, 220, 230, + 240, 250, 260, 270, 280, 290, 300, 310}, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + 0x55555555, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + 0, 32, 10, 33, 20, 34, 30, 35, + 40, 36, 50, 37, 60, 38, 70, 39, + 80, 40, 90, 41, 100, 42, 110, 43, + 120, 44, 130, 45, 140, 46, 150, 47)); + __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mulhrs_epi16 // CHECK: @llvm.x86.avx512.pmul.hr.sw.512 @@ -2578,6 +2704,33 @@ __m512i test_mm512_broadcastw_epi16(__m128i __A) { return _mm512_broadcastw_epi16(__A); } TEST_CONSTEXPR(match_v32hi(_mm512_broadcastw_epi16((__m128i)(__v8hi){42, 3, 10, 8, 0, 256, 256, 128}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42)); +TEST_CONSTEXPR(match_v32hi( + _mm512_permutex2var_epi16((__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34, + 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46}, + (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}), + 1, 32, 101, 132, 2, 102, 3, 103, + 4, 104, 5, 105, 6, 106, 7, 107, + 8, 108, 9, 109, 10, 110, 11, 111, + 12, 112, 13, 113, 14, 114, 15, 115)); +TEST_CONSTEXPR(match_v32hi( + _mm512_mask_permutex2var_epi16((__m512i)(__v32hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, + -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32}, + 0xAAAAAAAA, + (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34, + 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46}, + (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}), + -1, -32, -3, 132, -5, 102, -7, 103, + -9, 104, -11, 105, -13, 106, -15, 107, + -17, 108, -19, 109, -21, 110, -23, 111, + -25, 112, -27, 113, -29, 114, -31, 115)); __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) { // CHECK-LABEL: test_mm512_mask_broadcastw_epi16 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 69599379b6b3d..8e65430bd3e84 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -5607,6 +5607,56 @@ __m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i _ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B); } + +TEST_CONSTEXPR(match_v16si( + _mm512_permutex2var_epi32( + (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 150, 200, 350, 10, 210, 20, 220, + 30, 230, 40, 240, 50, 250, 60, 260)); +TEST_CONSTEXPR(match_v16si( + _mm512_mask_permutex2var_epi32( + (__m512i)(__v16si){-1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16}, + 0xAAAA, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + -1, -16, -3, 350, -5, 210, -7, 220, + -9, 230, -11, 240, -13, 250, -15, 260)); +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_permutex2var_epi32( + 0x5555, + (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 0, 200, 0, 10, 0, 20, 0, + 30, 0, 40, 0, 50, 0, 60, 0)); +TEST_CONSTEXPR(match_m512( + _mm512_permutex2var_ps( + (__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + 1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f, + 4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f)); +TEST_CONSTEXPR(match_m512d( + _mm512_permutex2var_pd( + (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + 1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0)); + __mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_testn_epi32_mask // CHECK: and <16 x i32> %{{.*}}, %{{.*}} @@ -11753,3 +11803,73 @@ void test_mm512_mask_i32loscatter_epi64(void *__addr, __mmask8 __mask, __m512i _ // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512 _mm512_mask_i32loscatter_epi64(__addr, __mask, __index, __v1, 2); } + + +TEST_CONSTEXPR(match_m512d( + _mm512_permutex2var_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + 1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0)); +TEST_CONSTEXPR(match_m512d( + _mm512_mask_permutex2var_pd((__m512d){-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0}, + 0xAA, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + -1.0, 108.0, -3.0, -8.0, -5.0, -2.0, -7.0, -3.0)); +TEST_CONSTEXPR(match_m512d( + _mm512_maskz_permutex2var_pd(0x55, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0)); + +TEST_CONSTEXPR(match_m512( + _mm512_permutex2var_ps((__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + 1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f, + 4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f)); +TEST_CONSTEXPR(match_m512( + _mm512_mask_permutex2var_ps((__m512){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f, + -9.f, -10.f, -11.f, -12.f, -13.f, -14.f, -15.f, -16.f}, + 0xAAAA, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + -1.f, -16.f, -3.f, 116.f, -5.f, 102.f, -7.f, 103.f, + -9.f, 104.f, -11.f, 105.f, -13.f, 106.f, -15.f, 107.f)); + +TEST_CONSTEXPR(match_v16si( + _mm512_permutex2var_epi32((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 16, 101, 116, 2, 102, 3, 103, + 4, 104, 5, 105, 6, 106, 7, 107)); +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_permutex2var_epi32(0x5555, + (__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 0, 101, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0)); + +TEST_CONSTEXPR(match_v8di( + _mm512_permutex2var_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + 1, 108, 1, 8, 2, 2, 3, 3)); +TEST_CONSTEXPR(match_v8di( + _mm512_mask_permutex2var_epi64((__m512i)(__v8di){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + -1, 108, -3, -8, -5, -2, -7, -3)); diff --git a/clang/test/CodeGen/X86/avx512vbmi-builtins.c b/clang/test/CodeGen/X86/avx512vbmi-builtins.c index c3b6298a39b59..7d506db92faeb 100644 --- a/clang/test/CodeGen/X86/avx512vbmi-builtins.c +++ b/clang/test/CodeGen/X86/avx512vbmi-builtins.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include +#include "builtin_test_helpers.h" __m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) { // CHECK-LABEL: test_mm512_mask2_permutex2var_epi8 @@ -33,6 +39,154 @@ __m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i _ return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B); } +TEST_CONSTEXPR(match_v64qu( + _mm512_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 200, 1, 201, 2, 202, 3, 203, + 4, 204, 5, 205, 6, 206, 7, 207, + 8, 208, 9, 209, 10, 210, 11, 211, + 12, 212, 13, 213, 14, 214, 15, 215, + 16, 216, 17, 217, 18, 218, 19, 219, + 20, 220, 21, 221, 22, 222, 23, 223, + 24, 224, 25, 225, 26, 226, 27, 227, + 28, 228, 29, 229, 30, 230, 31, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask_permutex2var_epi8((__m512i)(__v64qu){ + 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, + 58, 59, 60, 61, 62, 63, 64, 65, + 66, 67, 68, 69, 70, 71, 72, 73}, + 0xAAAAAAAAAAAAAAAAULL, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 10, 200, 12, 201, 14, 202, 16, 203, + 18, 204, 20, 205, 22, 206, 24, 207, + 26, 208, 28, 209, 30, 210, 32, 211, + 34, 212, 36, 213, 38, 214, 40, 215, + 42, 216, 44, 217, 46, 218, 48, 219, + 50, 220, 52, 221, 54, 222, 56, 223, + 58, 224, 60, 225, 62, 226, 64, 227, + 66, 228, 68, 229, 70, 230, 72, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_maskz_permutex2var_epi8(0x5555555555555555ULL, + (__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 0, 1, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0, + 8, 0, 9, 0, 10, 0, 11, 0, + 12, 0, 13, 0, 14, 0, 15, 0, + 16, 0, 17, 0, 18, 0, 19, 0, + 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, + 28, 0, 29, 0, 30, 0, 31, 0)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask2_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + 0x5555555555555555ULL, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95)); + __m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_permutexvar_epi8 // CHECK: call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c index c4d5fc8fb6977..49b7a1a721195 100644 --- a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c +++ b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include +#include "builtin_test_helpers.h" __m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_permutexvar_epi8 @@ -77,8 +83,28 @@ __m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, // CHECK-LABEL: test_mm_maskz_permutex2var_epi8 // CHECK: call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108)); +TEST_CONSTEXPR(match_v16qu( + _mm_mask_permutex2var_epi8((__m128i)(__v16qu){200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215}, + 0xAAAA, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108)); __m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi8 @@ -97,8 +123,44 @@ __m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i _ // CHECK-LABEL: test_mm256_maskz_permutex2var_epi8 // CHECK: call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108, + 9, 109, 10, 110, 11, 111, 12, 112, + 13, 113, 14, 114, 15, 115, 16, 116)); +TEST_CONSTEXPR(match_v32qu( + _mm256_mask_permutex2var_epi8((__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231}, + 0xAAAAAAAA, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108, + 216, 109, 218, 110, 220, 111, 222, 112, + 224, 113, 226, 114, 228, 115, 230, 116)); __m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_multishift_epi64_epi8 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 33c43977f72dc..121d5bf8d4adb 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -5610,12 +5610,23 @@ __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask2_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, 0x05, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 3, 100, 6)); __m256i test_mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask2_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + 0xA5, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 7, 100, 15, 1, 110, 2, 120)); __m128d test_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { // CHECK-LABEL: test_mm_mask2_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 @@ -5646,149 +5657,255 @@ __m128i test_mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask2_permutex2var_epi64((__m128i)(__v2di){10, 20}, + (__m128i)(__v2di){0, 5}, 0x1, + (__m128i)(__v2di){100, 200}), + 10, 5)); __m256i test_mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask2_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, + (__m256i)(__v4di){0, 1, 4, 5}, 0x5, + (__m256i)(__v4di){100, 110, 120, 130}), + 0, 1, 100, 5)); __m128i test_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 return _mm_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 40, 100, 300)); __m128i test_mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + -1, -4, -3, 300)); __m128i test_mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_maskz_permutex2var_epi32(0x0A, (__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 0, 40, 0, 300)); __m256i test_mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 return _mm256_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 100, 170, 10, 110, 20, 120)); __m256i test_mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, 0xAA, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + -1, -8, -3, 170, -5, 110, -7, 120)); __m256i test_mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_maskz_permutex2var_epi32(0xAA, (__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 0, 170, 0, 110, 0, 120)); __m128d test_mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 return _mm_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_permutex2var_pd((__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + 1.0, 10.0)); __m128d test_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_mask_permutex2var_pd((__m128d){-1.0, -2.0}, 0x2, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + -1.0, 10.0)); __m128d test_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_maskz_permutex2var_pd(0x2, (__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + 0.0, 10.0)); __m256d test_mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 return _mm256_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_permutex2var_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 1.0, 10.0, 2.0, 20.0)); __m256d test_mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_mask_permutex2var_pd((__m256d){-1.0, -2.0, -3.0, -4.0}, 0x2, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + -1.0, 10.0, -3.0, -4.0)); __m256d test_mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_maskz_permutex2var_pd(0x2, (__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 0.0, 10.0, 0.0, 0.0)); __m128 test_mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 return _mm_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_permutex2var_ps((__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 1.f, 4.f, 10.f, 30.f)); __m128 test_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_mask_permutex2var_ps((__m128){-1.f, -2.f, -3.f, -4.f}, 0x0A, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + -1.f, -4.f, -3.f, 30.f)); __m128 test_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_maskz_permutex2var_ps(0x0A, (__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 0.f, 4.f, 0.f, 30.f)); __m256 test_mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 return _mm256_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_permutex2var_ps((__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + 0.f, 7.f, 10.f, 17.f, 1.f, 11.f, 2.f, 12.f)); __m256 test_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_mask_permutex2var_ps((__m256){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f}, 0xAA, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + -1.f, -8.f, -3.f, 17.f, -5.f, 11.f, -7.f, 12.f)); __m256 test_mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_maskz_permutex2var_ps(0xAA, (__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + 0.f, 7.f, 0.f, 17.f, 0.f, 11.f, 0.f, 12.f)); __m128i test_mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 return _mm_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_permutex2var_epi64((__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 10, 200)); __m128i test_mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask_permutex2var_epi64((__m128i)(__v2di){-1, -2}, 0x2, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + -1, 200)); __m128i test_mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_maskz_permutex2var_epi64(0x2, (__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 0, 200)); __m256i test_mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 return _mm256_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + 0, 10, 100, 110)); __m256i test_mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask_permutex2var_epi64((__m256i)(__v4di){-1, -2, -3, -4}, 0x5, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + -1, -2, 100, -4)); __m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_maskz_permutex2var_epi64(0x5, (__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + 0, 0, 100, 0)); +TEST_CONSTEXPR(match_v4si( + _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 40, 100, 300)); +TEST_CONSTEXPR(match_v4si( + _mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + -1, -4, -3, 300)); __m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_cvtepi8_epi32 // CHECK: sext <4 x i8> %{{.*}} to <4 x i32> @@ -10472,6 +10589,17 @@ __m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) { TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0x33u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,0,0)); TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xAAu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 0,0,0,0, 0,4,0,4)); TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xFFu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,4,4)); +TEST_CONSTEXPR(match_v8si( + _mm256_permutex2var_epi32((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}), + 1, 8, 101, 108, 2, 102, 3, 103)); +TEST_CONSTEXPR(match_v8si( + _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}), + -1, -8, -3, 108, -5, 102, -7, 103)); __m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_mov_pd diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 116d86fcd597d..d3b76fecf3c55 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -1872,6 +1872,67 @@ __m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_permutex2var_epi16(__U,__A,__I,__B); } + +TEST_CONSTEXPR(match_v8hi( + _mm_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70}, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, 160, + 170}), + 0, 70, 100, 170, 10, 110, 20, 120)); +TEST_CONSTEXPR(match_v8hi( + _mm_mask_permutex2var_epi16((__m128i)(__v8hi){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, + 160, 170}), + -1, -8, -3, 170, -5, 110, -7, 120)); +TEST_CONSTEXPR(match_v8hi( + _mm_maskz_permutex2var_epi16(0xAA, + (__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70}, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, + 160, 170}), + 0, 70, 0, 170, 0, 110, 0, 120)); +TEST_CONSTEXPR(match_v8hi( + _mm_mask2_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70}, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + 0x55, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, + 160, 170}), + 0, 7, 100, 15, 10, 9, 20, 10)); +TEST_CONSTEXPR(match_v16hi( + _mm256_permutex2var_epi16( + (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 150, 200, 350, 10, 210, 20, 220, + 30, 230, 40, 240, 50, 250, 60, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_mask_permutex2var_epi16( + (__m256i)(__v16hi){-1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16}, + 0xAAAA, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + -1, -16, -3, 350, -5, 210, -7, 220, + -9, 230, -11, 240, -13, 250, -15, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_maskz_permutex2var_epi16( + 0x5555, + (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 0, 200, 0, 10, 0, 20, 0, + 30, 0, 40, 0, 50, 0, 60, 0)); + __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_maddubs_epi16 // CHECK: @llvm.x86.ssse3.pmadd.ub.sw @@ -3558,3 +3619,22 @@ void test_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i _ // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256 _mm256_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A); } + + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 127, 126, 125}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){100, 110, 120, 130, 140, 150, 160, 170, + 180, 190, 200, 210, 220, 230, 240, 250}), + 0, 100, 10, 110, 20, 120, 30, 130, + 40, 140, 50, 150, 60, 160, 70, 170)); +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231}), + 0, 200, 10, 201, 20, 202, 30, 203, + 40, 204, 50, 205, 60, 206, 70, 207, + 80, 208, 90, 209, 100, 210, 110, 211, + 120, 212, 127, 213, 126, 214, 125, 215));