From a56119705ad5e317a3cd6cbc471f3d003ef157d4 Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki Date: Wed, 12 Nov 2025 16:56:48 -0800 Subject: [PATCH 1/3] [Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow AVX512 permutexvar intrinsics to be used in constexpr Resolves: #167476 --- clang/include/clang/Basic/BuiltinsX86.td | 23 ++++--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 44 +++++++++++++ clang/lib/AST/ExprConstant.cpp | 64 +++++++++++++++++++ clang/lib/Headers/avx10_2_512bf16intrin.h | 2 +- clang/lib/Headers/avx10_2bf16intrin.h | 4 +- clang/lib/Headers/avx2intrin.h | 10 ++- clang/lib/Headers/avx512bwintrin.h | 18 +++--- clang/lib/Headers/avx512fintrin.h | 65 ++++++++------------ clang/lib/Headers/avx512vlbwintrin.h | 35 +++++------ clang/lib/Headers/avx512vlintrin.h | 54 +++++++--------- clang/test/CodeGen/X86/avx512bw-builtins.c | 13 +++- clang/test/CodeGen/X86/avx512f-builtins.c | 56 ++++++++++++++--- clang/test/CodeGen/X86/avx512vl-builtins.c | 37 +++++++++++ clang/test/CodeGen/X86/avx512vlbw-builtins.c | 25 ++++++-- 14 files changed, 313 insertions(+), 137 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 14c7d636ad51e..04470faf89030 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -603,6 +603,11 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid def vec_set_v8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int, _Constant int)">; } +let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; + def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; +} + let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">; @@ -617,9 +622,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; - def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">; - def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">; def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; } @@ -3042,38 +3045,38 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512> def permdi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def permvarhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def permvardf512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>)">; def permvardi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">; def permvarsf512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>)">; def permvarsi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">; } -let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def permvarqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def permvarqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def permvarqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; } -let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def permvarhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; } -let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def permvarhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def permvardf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">; def permvardi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 6c7b2f502cc51..c72a3566681b1 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -4414,6 +4414,50 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return std::pair{0, static_cast(DstIdx)}; } }); + case X86::BI__builtin_ia32_permvarsi256: + case X86::BI__builtin_ia32_permvarsf256: + case X86::BI__builtin_ia32_permvardf512: + case X86::BI__builtin_ia32_permvardi512: + case X86::BI__builtin_ia32_permvarhi128: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x7; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_permvarqi128: + case X86::BI__builtin_ia32_permvarhi256: + case X86::BI__builtin_ia32_permvarsi512: + case X86::BI__builtin_ia32_permvarsf512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0xF; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_permvardi256: + case X86::BI__builtin_ia32_permvardf256: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_permvarqi256: + case X86::BI__builtin_ia32_permvarhi512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1F; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_permvarqi512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3F; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + }); case X86::BI__builtin_ia32_vpermi2varq128: case X86::BI__builtin_ia32_vpermi2varpd128: return interp__builtin_ia32_shuffle_generic( diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 1bfea24b228e8..e9e448143477e 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13551,6 +13551,70 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return false; return Success(R, E); } + case X86::BI__builtin_ia32_permvarsi256: + case X86::BI__builtin_ia32_permvarsf256: + case X86::BI__builtin_ia32_permvardf512: + case X86::BI__builtin_ia32_permvardi512: + case X86::BI__builtin_ia32_permvarhi128: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x7; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_permvarqi128: + case X86::BI__builtin_ia32_permvarhi256: + case X86::BI__builtin_ia32_permvarsi512: + case X86::BI__builtin_ia32_permvarsf512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0xF; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_permvardi256: + case X86::BI__builtin_ia32_permvardf256: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_permvarqi256: + case X86::BI__builtin_ia32_permvarhi512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1F; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_permvarqi512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3F; + unsigned SrcIdx = 0; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } case X86::BI__builtin_ia32_vpermi2varq128: case X86::BI__builtin_ia32_vpermi2varpd128: { APValue R; diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h index 46ec12a63ef9c..3201307af4731 100644 --- a/clang/lib/Headers/avx10_2_512bf16intrin.h +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -179,7 +179,7 @@ _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { (__v32hi)__B); } -static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutexvar_pbh(__m512i __A, __m512bh __B) { return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h index 8fb8cd7cd0865..9f5b726d7b789 100644 --- a/clang/lib/Headers/avx10_2bf16intrin.h +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -307,12 +307,12 @@ _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { (__v16hi)__B); } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutexvar_pbh(__m128i __A, __m128bh __B) { return (__m128bh)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); } -static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutexvar_pbh(__m256i __A, __m256bh __B) { return (__m256bh)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); } diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 3cbaaece7b38e..3e3c13d8bd662 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -3214,9 +3214,8 @@ _mm_broadcastq_epi64(__m128i __X) { /// A 256-bit vector of [8 x i32] containing indexes of values to use from /// \a __a. /// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); } @@ -3272,9 +3271,8 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) /// A 256-bit vector of [8 x i32] containing indexes of values to use from /// \a __a. /// \returns A 256-bit vector of [8 x float] containing the result. -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) { return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); } diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 4a02c96620335..3cfa32eb9e727 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -1846,25 +1846,21 @@ _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) (__v32hi) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_epi16 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, (__v32hi)_mm512_permutexvar_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, (__v32hi)_mm512_permutexvar_epi16(__A, __B), (__v32hi)__W); diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 997e9608e112f..79c37173ac838 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -7959,93 +7959,82 @@ _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) (__v8di)_mm512_permutex_epi64((X), (C)), \ (__v8di)_mm512_setzero_si512())) -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_pd (__m512i __X, __m512d __Y) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_pd(__m512i __X, __m512d __Y) { return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, + __m512d __Y) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_permutexvar_pd(__X, __Y), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_permutexvar_pd(__X, __Y), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_epi64(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_permutexvar_epi64(__X, __Y), (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_permutexvar_epi64(__X, __Y), (__v8di)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_ps (__m512i __X, __m512 __Y) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_ps(__m512i __X, __m512 __Y) { return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_permutexvar_ps(__X, __Y), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_permutexvar_ps(__X, __Y), (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutexvar_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X); } #define _mm512_permutevar_epi32 _mm512_permutexvar_epi32 -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_permutexvar_epi32(__X, __Y), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, - __m512i __Y) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_permutexvar_epi32(__X, __Y), (__v16si)__W); diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index d23188ab02b6c..88c3e1569bb2d 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -2624,48 +2624,41 @@ _mm_maskz_set1_epi16 (__mmask8 __M, short __A) (__v8hi) _mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutexvar_epi16 (__m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutexvar_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarhi128((__v8hi) __B, (__v8hi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, (__v8hi)_mm_permutexvar_epi16(__A, __B), (__v8hi) _mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, (__v8hi)_mm_permutexvar_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi16 (__m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutexvar_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarhi256((__v16hi) __B, (__v16hi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, (__v16hi)_mm256_permutexvar_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, (__v16hi)_mm256_permutexvar_epi16(__A, __B), (__v16hi)__W); diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index e5249926b934e..1e6e42df6b5fb 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -7804,47 +7804,41 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) (__v4di)_mm256_permutex_epi64((X), (C)), \ (__v4di)_mm256_setzero_si256())) -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_pd (__m256i __X, __m256d __Y) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutexvar_pd(__m256i __X, __m256d __Y) { return (__m256d)__builtin_ia32_permvardf256((__v4df)__Y, (__v4di)__X); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, - __m256d __Y) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, __m256i __X, + __m256d __Y) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_permutexvar_pd(__X, __Y), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, __m256d __Y) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_permutexvar_pd(__X, __Y), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutexvar_epi64(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_permvardi256((__v4di) __Y, (__v4di) __X); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_permutexvar_epi64(__X, __Y), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_permutexvar_epi64(__X, __Y), (__v4di)__W); @@ -7852,17 +7846,15 @@ _mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, #define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A)) -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_permutexvar_ps(__X, __Y), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_permutexvar_ps(__X, __Y), (__v8sf)_mm256_setzero_ps()); @@ -7870,18 +7862,16 @@ _mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) #define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A)) -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) -{ + __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_permutexvar_epi32(__X, __Y), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_permutexvar_epi32(__X, __Y), (__v8si)_mm256_setzero_si256()); diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 2749dc5741b58..45f1a600d380a 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -3036,9 +3036,12 @@ TEST_CONSTEXPR(match_v32hi(_mm512_maskz_set1_epi16((__mmask32)0xAAAAAAAA,42),0,4 __m512i test_mm512_permutexvar_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.512 - return _mm512_permutexvar_epi16(__A, __B); + return _mm512_permutexvar_epi16(__A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_permutexvar_epi16((__m512i)(__v32hi){31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v32hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41}), 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v32hi(_mm512_permutexvar_epi16((__m512i)(__v32hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, (__m512i)(__v32hi){100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131}), 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100)); + __m512i test_mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.512 @@ -3050,8 +3053,14 @@ __m512i test_mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, __m512i __ // CHECK-LABEL: test_mm512_mask_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - return _mm512_mask_permutexvar_epi16(__W, __M, __A, __B); + return _mm512_mask_permutexvar_epi16(__W, __M, __A, __B); } + +TEST_CONSTEXPR(match_v32hi(_mm512_mask_permutexvar_epi16((__m512i)(__v32hi){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0xFFFFFFFF, (__m512i)(__v32hi){31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v32hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41}), 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v32hi(_mm512_mask_permutexvar_epi16((__m512i)(__v32hi){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0xAAAAAAAA, (__m512i)(__v32hi){31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v32hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41}), 99, 40, 99, 38, 99, 36, 99, 34, 99, 32, 99, 30, 99, 28, 99, 26, 99, 24, 99, 22, 99, 20, 99, 18, 99, 16, 99, 14, 99, 12, 99, 10)); +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_permutexvar_epi16(0xFFFFFFFF, (__m512i)(__v32hi){31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v32hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41}), 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_permutexvar_epi16(0xAAAAAAAA, (__m512i)(__v32hi){31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v32hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41}), 0, 40, 0, 38, 0, 36, 0, 34, 0, 32, 0, 30, 0, 28, 0, 26, 0, 24, 0, 22, 0, 20, 0, 18, 0, 16, 0, 14, 0, 12, 0, 10)); + __m512i test_mm512_alignr_epi8(__m512i __A,__m512i __B){ // CHECK-LABEL: test_mm512_alignr_epi8 // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 17778b52d3671..71e700af0069e 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -8874,23 +8874,32 @@ __m512i test_mm512_maskz_permutex_epi64(__mmask8 __M, __m512i __X) { __m512d test_mm512_permutexvar_pd(__m512i __X, __m512d __Y) { // CHECK-LABEL: test_mm512_permutexvar_pd // CHECK: @llvm.x86.avx512.permvar.df.512 - return _mm512_permutexvar_pd(__X, __Y); + return _mm512_permutexvar_pd(__X, __Y); } +TEST_CONSTEXPR(match_m512d(_mm512_permutexvar_pd((__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512d){0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}), 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)); +TEST_CONSTEXPR(match_m512d(_mm512_permutexvar_pd((__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0}, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)); + __m512d test_mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) { // CHECK-LABEL: test_mm512_mask_permutexvar_pd // CHECK: @llvm.x86.avx512.permvar.df.512 // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} - return _mm512_mask_permutexvar_pd(__W, __U, __X, __Y); + return _mm512_mask_permutexvar_pd(__W, __U, __X, __Y); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_permutexvar_pd((__m512d){9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0}, 0xFF, (__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512d){0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}), 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)); +TEST_CONSTEXPR(match_m512d(_mm512_mask_permutexvar_pd((__m512d){9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0}, 0xAA, (__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512d){0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}), 9.0, 6.0, 9.0, 4.0, 9.0, 2.0, 9.0, 0.0)); + __m512d test_mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y) { // CHECK-LABEL: test_mm512_maskz_permutexvar_pd // CHECK: @llvm.x86.avx512.permvar.df.512 // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} - return _mm512_maskz_permutexvar_pd(__U, __X, __Y); + return _mm512_maskz_permutexvar_pd(__U, __X, __Y); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_permutexvar_pd(0xFF, (__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512d){0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}), 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)); +TEST_CONSTEXPR(match_m512d(_mm512_maskz_permutexvar_pd(0xAA, (__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512d){0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}), 0.0, 6.0, 0.0, 4.0, 0.0, 2.0, 0.0, 0.0)); + __m512i test_mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_maskz_permutexvar_epi64 // CHECK: @llvm.x86.avx512.permvar.di.512 @@ -8901,36 +8910,54 @@ __m512i test_mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __ __m512i test_mm512_permutexvar_epi64(__m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_permutexvar_epi64 // CHECK: @llvm.x86.avx512.permvar.di.512 - return _mm512_permutexvar_epi64(__X, __Y); + return _mm512_permutexvar_epi64(__X, __Y); } +TEST_CONSTEXPR(match_v8di(_mm512_permutexvar_epi64((__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v8di){10, 11, 12, 13, 14, 15, 16, 17}), 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v8di(_mm512_permutexvar_epi64((__m512i)(__v8di){0, 1, 2, 3, 4, 5, 6, 7}, (__m512i)(__v8di){100, 101, 102, 103, 104, 105, 106, 107}), 100, 101, 102, 103, 104, 105, 106, 107)); + __m512i test_mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_mask_permutexvar_epi64 // CHECK: @llvm.x86.avx512.permvar.di.512 // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_mask_permutexvar_epi64(__W, __M, __X, __Y); + return _mm512_mask_permutexvar_epi64(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v8di(_mm512_mask_permutexvar_epi64((__m512i)(__v8di){99, 99, 99, 99, 99, 99, 99, 99}, 0xFF, (__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v8di){10, 11, 12, 13, 14, 15, 16, 17}), 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v8di(_mm512_mask_permutexvar_epi64((__m512i)(__v8di){99, 99, 99, 99, 99, 99, 99, 99}, 0xAA, (__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v8di){10, 11, 12, 13, 14, 15, 16, 17}), 99, 16, 99, 14, 99, 12, 99, 10)); + +TEST_CONSTEXPR(match_v8di(_mm512_maskz_permutexvar_epi64(0xFF, (__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v8di){10, 11, 12, 13, 14, 15, 16, 17}), 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v8di(_mm512_maskz_permutexvar_epi64(0xAA, (__m512i)(__v8di){7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v8di){10, 11, 12, 13, 14, 15, 16, 17}), 0, 16, 0, 14, 0, 12, 0, 10)); + __m512 test_mm512_permutexvar_ps(__m512i __X, __m512 __Y) { // CHECK-LABEL: test_mm512_permutexvar_ps // CHECK: @llvm.x86.avx512.permvar.sf.512 - return _mm512_permutexvar_ps(__X, __Y); + return _mm512_permutexvar_ps(__X, __Y); } +TEST_CONSTEXPR(match_m512(_mm512_permutexvar_ps((__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}), 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +TEST_CONSTEXPR(match_m512(_mm512_permutexvar_ps((__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, (__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)); + __m512 test_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) { // CHECK-LABEL: test_mm512_mask_permutexvar_ps // CHECK: @llvm.x86.avx512.permvar.sf.512 // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} - return _mm512_mask_permutexvar_ps(__W, __U, __X, __Y); + return _mm512_mask_permutexvar_ps(__W, __U, __X, __Y); } +TEST_CONSTEXPR(match_m512(_mm512_mask_permutexvar_ps((__m512){99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f}, 0xFFFF, (__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}), 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +TEST_CONSTEXPR(match_m512(_mm512_mask_permutexvar_ps((__m512){99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f}, 0xAAAA, (__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}), 99.0f, 14.0f, 99.0f, 12.0f, 99.0f, 10.0f, 99.0f, 8.0f, 99.0f, 6.0f, 99.0f, 4.0f, 99.0f, 2.0f, 99.0f, 0.0f)); + __m512 test_mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y) { // CHECK-LABEL: test_mm512_maskz_permutexvar_ps // CHECK: @llvm.x86.avx512.permvar.sf.512 // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} - return _mm512_maskz_permutexvar_ps(__U, __X, __Y); + return _mm512_maskz_permutexvar_ps(__U, __X, __Y); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_permutexvar_ps(0xFFFF, (__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}), 15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +TEST_CONSTEXPR(match_m512(_mm512_maskz_permutexvar_ps(0xAAAA, (__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}), 0.0f, 14.0f, 0.0f, 12.0f, 0.0f, 10.0f, 0.0f, 8.0f, 0.0f, 6.0f, 0.0f, 4.0f, 0.0f, 2.0f, 0.0f, 0.0f)); + __m512i test_mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_maskz_permutexvar_epi32 // CHECK: @llvm.x86.avx512.permvar.si.512 @@ -8941,16 +8968,25 @@ __m512i test_mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i _ __m512i test_mm512_permutexvar_epi32(__m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_permutexvar_epi32 // CHECK: @llvm.x86.avx512.permvar.si.512 - return _mm512_permutexvar_epi32(__X, __Y); + return _mm512_permutexvar_epi32(__X, __Y); } +TEST_CONSTEXPR(match_v16si(_mm512_permutexvar_epi32((__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v16si){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v16si(_mm512_permutexvar_epi32((__m512i)(__v16si){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, (__m512i)(__v16si){100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115}), 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115)); + __m512i test_mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_mask_permutexvar_epi32 // CHECK: @llvm.x86.avx512.permvar.si.512 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} - return _mm512_mask_permutexvar_epi32(__W, __M, __X, __Y); + return _mm512_mask_permutexvar_epi32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_permutexvar_epi32((__m512i)(__v16si){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0xFFFF, (__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v16si){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v16si(_mm512_mask_permutexvar_epi32((__m512i)(__v16si){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0xAAAA, (__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v16si){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 99, 24, 99, 22, 99, 20, 99, 18, 99, 16, 99, 14, 99, 12, 99, 10)); + +TEST_CONSTEXPR(match_v16si(_mm512_maskz_permutexvar_epi32(0xFFFF, (__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v16si){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v16si(_mm512_maskz_permutexvar_epi32(0xAAAA, (__m512i)(__v16si){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m512i)(__v16si){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 0, 24, 0, 22, 0, 20, 0, 18, 0, 16, 0, 14, 0, 12, 0, 10)); + __mmask16 test_mm512_kand(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: test_mm512_kand // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1> diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 121d5bf8d4adb..a7eee79c97539 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -10334,6 +10334,9 @@ __m256d test_mm256_permutexvar_pd(__m256i __X, __m256d __Y) { return _mm256_permutexvar_pd(__X, __Y); } +TEST_CONSTEXPR(match_m256d(_mm256_permutexvar_pd((__m256i)(__v4di){3, 2, 1, 0}, (__m256d){0.0, 1.0, 2.0, 3.0}), 3.0, 2.0, 1.0, 0.0)); +TEST_CONSTEXPR(match_m256d(_mm256_permutexvar_pd((__m256i)(__v4di){0, 0, 0, 0}, (__m256d){1.0, 2.0, 3.0, 4.0}), 1.0, 1.0, 1.0, 1.0)); + __m256d test_mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, __m256i __X, __m256d __Y) { // CHECK-LABEL: test_mm256_mask_permutexvar_pd // CHECK: @llvm.x86.avx512.permvar.df.256 @@ -10348,6 +10351,11 @@ __m256d test_mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, __m256d __Y) return _mm256_maskz_permutexvar_pd(__U, __X, __Y); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_permutexvar_pd((__m256d){9.0, 9.0, 9.0, 9.0}, 0xF, (__m256i)(__v4di){3, 2, 1, 0}, (__m256d){0.0, 1.0, 2.0, 3.0}), 3.0, 2.0, 1.0, 0.0)); +TEST_CONSTEXPR(match_m256d(_mm256_mask_permutexvar_pd((__m256d){9.0, 9.0, 9.0, 9.0}, 0xA, (__m256i)(__v4di){3, 2, 1, 0}, (__m256d){0.0, 1.0, 2.0, 3.0}), 9.0, 2.0, 9.0, 0.0)); +TEST_CONSTEXPR(match_m256d(_mm256_maskz_permutexvar_pd(0xF, (__m256i)(__v4di){3, 2, 1, 0}, (__m256d){0.0, 1.0, 2.0, 3.0}), 3.0, 2.0, 1.0, 0.0)); +TEST_CONSTEXPR(match_m256d(_mm256_maskz_permutexvar_pd(0xA, (__m256i)(__v4di){3, 2, 1, 0}, (__m256d){0.0, 1.0, 2.0, 3.0}), 0.0, 2.0, 0.0, 0.0)); + __m256i test_mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_maskz_permutexvar_epi64 // CHECK: @llvm.x86.avx512.permvar.di.256 @@ -10362,6 +10370,11 @@ __m256i test_mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, __m256i __X return _mm256_mask_permutexvar_epi64(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v4di(_mm256_mask_permutexvar_epi64((__m256i)(__v4di){99, 99, 99, 99}, 0xF, (__m256i)(__v4di){3, 2, 1, 0}, (__m256i)(__v4di){10, 11, 12, 13}), 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v4di(_mm256_mask_permutexvar_epi64((__m256i)(__v4di){99, 99, 99, 99}, 0xA, (__m256i)(__v4di){3, 2, 1, 0}, (__m256i)(__v4di){10, 11, 12, 13}), 99, 12, 99, 10)); +TEST_CONSTEXPR(match_v4di(_mm256_maskz_permutexvar_epi64(0xF, (__m256i)(__v4di){3, 2, 1, 0}, (__m256i)(__v4di){10, 11, 12, 13}), 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v4di(_mm256_maskz_permutexvar_epi64(0xA, (__m256i)(__v4di){3, 2, 1, 0}, (__m256i)(__v4di){10, 11, 12, 13}), 0, 12, 0, 10)); + __m256 test_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) { // CHECK-LABEL: test_mm256_mask_permutexvar_ps // CHECK: @llvm.x86.avx2.permps @@ -10381,6 +10394,19 @@ __m256 test_mm256_permutexvar_ps(__m256i __X, __m256 __Y) { return _mm256_permutexvar_ps( __X, __Y); } +TEST_CONSTEXPR(match_m256(_mm256_permutexvar_ps(((__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}), + ((__m256){0.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f})), + 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +TEST_CONSTEXPR(match_m256(_mm256_permutexvar_ps(((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256){1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f})), + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f)); +TEST_CONSTEXPR(match_m256(_mm256_mask_permutexvar_ps((__m256){99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f}, 0xFF, (__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}, (__m256){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}), 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +TEST_CONSTEXPR(match_m256(_mm256_mask_permutexvar_ps((__m256){99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f}, 0xAA, (__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}, (__m256){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}), 99.0f, 6.0f, 99.0f, 4.0f, 99.0f, 2.0f, 99.0f, 0.0f)); +TEST_CONSTEXPR(match_m256(_mm256_maskz_permutexvar_ps(0xFF, (__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}, (__m256){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}), 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +TEST_CONSTEXPR(match_m256(_mm256_maskz_permutexvar_ps(0xAA, (__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}, (__m256){0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}), 0.0f, 6.0f, 0.0f, 4.0f, 0.0f, 2.0f, 0.0f, 0.0f)); + __m256i test_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_maskz_permutexvar_epi32 // CHECK: @llvm.x86.avx2.permd @@ -10401,6 +10427,17 @@ __m256i test_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X return _mm256_mask_permutexvar_epi32(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v8si(_mm256_permutexvar_epi32(((__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}), + ((__m256i)(__v8si){10, 11, 12, 13, 14, 15, 16, 17})), + 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v8si(_mm256_permutexvar_epi32(((__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}), + ((__m256i)(__v8si){100, 101, 102, 103, 104, 105, 106, 107})), + 100, 101, 102, 103, 104, 105, 106, 107)); +TEST_CONSTEXPR(match_v8si(_mm256_mask_permutexvar_epi32((__m256i)(__v8si){99, 99, 99, 99, 99, 99, 99, 99}, 0xFF, (__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v8si){10, 11, 12, 13, 14, 15, 16, 17}), 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v8si(_mm256_mask_permutexvar_epi32((__m256i)(__v8si){99, 99, 99, 99, 99, 99, 99, 99}, 0xAA, (__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v8si){10, 11, 12, 13, 14, 15, 16, 17}), 99, 16, 99, 14, 99, 12, 99, 10)); +TEST_CONSTEXPR(match_v8si(_mm256_maskz_permutexvar_epi32(0xFF, (__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v8si){10, 11, 12, 13, 14, 15, 16, 17}), 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v8si(_mm256_maskz_permutexvar_epi32(0xAA, (__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v8si){10, 11, 12, 13, 14, 15, 16, 17}), 0, 16, 0, 14, 0, 12, 0, 10)); + __m128i test_mm_alignr_epi32(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_alignr_epi32 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 7a5af2dc8742f..a262e83083cab 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -3496,9 +3496,12 @@ TEST_CONSTEXPR(match_v8hi(_mm_maskz_set1_epi16((__mmask8)0xAA,42),0,42,0,42,0,42 __m128i test_mm_permutexvar_epi16(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.128 - return _mm_permutexvar_epi16(__A, __B); + return _mm_permutexvar_epi16(__A, __B); } +TEST_CONSTEXPR(match_v8hi(_mm_permutexvar_epi16((__m128i)(__v8hi){7, 6, 5, 4, 3, 2, 1, 0}, (__m128i)(__v8hi){10, 11, 12, 13, 14, 15, 16, 17}), 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v8hi(_mm_permutexvar_epi16((__m128i)(__v8hi){0, 0, 0, 0, 0, 0, 0, 0}, (__m128i)(__v8hi){100, 101, 102, 103, 104, 105, 106, 107}), 100, 100, 100, 100, 100, 100, 100, 100)); + __m128i test_mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.128 @@ -3510,15 +3513,23 @@ __m128i test_mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, __m128i __A, _ // CHECK-LABEL: test_mm_mask_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} - return _mm_mask_permutexvar_epi16(__W, __M, __A, __B); + return _mm_mask_permutexvar_epi16(__W, __M, __A, __B); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_permutexvar_epi16((__m128i)(__v8hi){99, 99, 99, 99, 99, 99, 99, 99}, 0xFF, (__m128i)(__v8hi){7, 6, 5, 4, 3, 2, 1, 0}, (__m128i)(__v8hi){10, 11, 12, 13, 14, 15, 16, 17}), 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_permutexvar_epi16((__m128i)(__v8hi){99, 99, 99, 99, 99, 99, 99, 99}, 0xAA, (__m128i)(__v8hi){7, 6, 5, 4, 3, 2, 1, 0}, (__m128i)(__v8hi){10, 11, 12, 13, 14, 15, 16, 17}), 99, 16, 99, 14, 99, 12, 99, 10)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_permutexvar_epi16(0xFF, (__m128i)(__v8hi){7, 6, 5, 4, 3, 2, 1, 0}, (__m128i)(__v8hi){10, 11, 12, 13, 14, 15, 16, 17}), 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_permutexvar_epi16(0xAA, (__m128i)(__v8hi){7, 6, 5, 4, 3, 2, 1, 0}, (__m128i)(__v8hi){10, 11, 12, 13, 14, 15, 16, 17}), 0, 16, 0, 14, 0, 12, 0, 10)); + __m256i test_mm256_permutexvar_epi16(__m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.256 - return _mm256_permutexvar_epi16(__A, __B); + return _mm256_permutexvar_epi16(__A, __B); } +TEST_CONSTEXPR(match_v16hi(_mm256_permutexvar_epi16((__m256i)(__v16hi){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v16hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v16hi(_mm256_permutexvar_epi16((__m256i)(__v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, (__m256i)(__v16hi){100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115}), 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100)); + __m256i test_mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.256 @@ -3530,8 +3541,14 @@ __m256i test_mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, __m256i __ // CHECK-LABEL: test_mm256_mask_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} - return _mm256_mask_permutexvar_epi16(__W, __M, __A, __B); + return _mm256_mask_permutexvar_epi16(__W, __M, __A, __B); } + +TEST_CONSTEXPR(match_v16hi(_mm256_mask_permutexvar_epi16((__m256i)(__v16hi){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0xFFFF, (__m256i)(__v16hi){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v16hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v16hi(_mm256_mask_permutexvar_epi16((__m256i)(__v16hi){99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}, 0xAAAA, (__m256i)(__v16hi){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v16hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 99, 24, 99, 22, 99, 20, 99, 18, 99, 16, 99, 14, 99, 12, 99, 10)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_permutexvar_epi16(0xFFFF, (__m256i)(__v16hi){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v16hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_permutexvar_epi16(0xAAAA, (__m256i)(__v16hi){15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v16hi){10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}), 0, 24, 0, 22, 0, 20, 0, 18, 0, 16, 0, 14, 0, 12, 0, 10)); + __m128i test_mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_alignr_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> From 6752579d327b50ff2b159adb61503aeb9b236c86 Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki Date: Thu, 13 Nov 2025 03:31:25 -0800 Subject: [PATCH 2/3] Address code review comments - Group permvarsi256/permvarsf256 with other AVX2 constexpr builtins - Remove unnecessary SrcIdx variable and use zero directly in pair construction --- clang/include/clang/Basic/BuiltinsX86.td | 8 +++----- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 15 +++++---------- clang/lib/AST/ExprConstant.cpp | 15 +++++---------- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 04470faf89030..69d18679fd6ec 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -603,11 +603,6 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid def vec_set_v8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int, _Constant int)">; } -let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { - def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; - def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; -} - let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">; @@ -695,6 +690,9 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">; + + def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; + def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; } let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index c72a3566681b1..cee3c1b8cf8f3 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -4422,8 +4422,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return interp__builtin_ia32_shuffle_generic( S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0x7; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; }); case X86::BI__builtin_ia32_permvarqi128: case X86::BI__builtin_ia32_permvarhi256: @@ -4432,31 +4431,27 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return interp__builtin_ia32_shuffle_generic( S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0xF; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; }); case X86::BI__builtin_ia32_permvardi256: case X86::BI__builtin_ia32_permvardf256: return interp__builtin_ia32_shuffle_generic( S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0x3; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; }); case X86::BI__builtin_ia32_permvarqi256: case X86::BI__builtin_ia32_permvarhi512: return interp__builtin_ia32_shuffle_generic( S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0x1F; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; }); case X86::BI__builtin_ia32_permvarqi512: return interp__builtin_ia32_shuffle_generic( S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0x3F; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; }); case X86::BI__builtin_ia32_vpermi2varq128: case X86::BI__builtin_ia32_vpermi2varpd128: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index e9e448143477e..b7da89ab3dcf2 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13560,8 +13560,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0x7; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; })) return false; return Success(R, E); @@ -13574,8 +13573,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0xF; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; })) return false; return Success(R, E); @@ -13586,8 +13584,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0x3; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; })) return false; return Success(R, E); @@ -13598,8 +13595,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0x1F; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; })) return false; return Success(R, E); @@ -13609,8 +13605,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned ShuffleMask) { int Offset = ShuffleMask & 0x3F; - unsigned SrcIdx = 0; - return std::pair{SrcIdx, Offset}; + return std::pair{0, Offset}; })) return false; return Success(R, E); From 3502a040dedc958d2a2337ea81891bedf527b77c Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki Date: Thu, 13 Nov 2025 04:23:31 -0800 Subject: [PATCH 3/3] Add tests for permutevar8x32, permutevar8x32_epi32 --- clang/test/CodeGen/X86/avx2-builtins.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index ce8e2f04e487c..d22f2f8be8be3 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1119,12 +1119,16 @@ __m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) { // CHECK: call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_permutevar8x32_epi32(a, b); } +TEST_CONSTEXPR(match_v8si(_mm256_permutevar8x32_epi32((__m256i)(__v8si){7, 6, 5, 4, 3, 2, 1, 0}, (__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}), 7, 6, 5, 4, 3, 2, 1, 0)); +TEST_CONSTEXPR(match_v8si(_mm256_permutevar8x32_epi32((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, (__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), 0, 0, 0, 0, 0, 0, 0, 0)); __m256 test_mm256_permutevar8x32_ps(__m256 a, __m256i b) { // CHECK-LABEL: test_mm256_permutevar8x32_ps // CHECK: call {{.*}}<8 x float> @llvm.x86.avx2.permps(<8 x float> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_permutevar8x32_ps(a, b); } +TEST_CONSTEXPR(match_m256(_mm256_permutevar8x32_ps((__m256){7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f}, (__m256i)(__v8si){0, 1, 2, 3, 4, 5, 6, 7}), 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +TEST_CONSTEXPR(match_m256(_mm256_permutevar8x32_ps((__m256){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, (__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)); __m256i test_mm256_sad_epu8(__m256i x, __m256i y) { // CHECK-LABEL: test_mm256_sad_epu8