diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index cb08e2107f072..da34d4123628f 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1132,27 +1132,27 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5 } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">; } let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">; } let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { @@ -4325,12 +4325,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512> } let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">; + def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">; + def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">; + def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">; + def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">; + def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">; } let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { @@ -4338,51 +4338,51 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512> } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">; } let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h index fdb57c7c9e27b..b2215b72c57bc 100644 --- a/clang/lib/Headers/avx10_2_512niintrin.h +++ b/clang/lib/Headers/avx10_2_512niintrin.h @@ -185,8 +185,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -206,8 +206,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32( @@ -227,8 +227,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -248,8 +248,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32( @@ -269,8 +269,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -290,8 +290,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32( diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index a1a0338a69e0d..4b8a199af32e5 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -80,8 +80,8 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssd_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v16hi)(A), (__v16hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -98,8 +98,9 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssds_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v16hi)(A), \ + (__v16hi)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -157,8 +158,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssd_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssd_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -175,8 +176,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssds_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h index c386923360de6..2ce88efe4a04f 100644 --- a/clang/lib/Headers/avx512vnniintrin.h +++ b/clang/lib/Headers/avx512vnniintrin.h @@ -68,8 +68,8 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -91,8 +91,8 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h index 805d249911c17..801795ac543dd 100644 --- a/clang/lib/Headers/avxvnniint16intrin.h +++ b/clang/lib/Headers/avxvnniint16intrin.h @@ -15,6 +15,7 @@ #ifndef __AVXVNNIINT16INTRIN_H #define __AVXVNNIINT16INTRIN_H +// clang-format off /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -45,10 +46,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpwsud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -79,10 +82,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpwsud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -114,10 +119,13 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with #define _mm_dpwsuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -149,10 +157,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpwsuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -183,10 +193,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpwusd_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hi)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -217,10 +229,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpwusd_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hi)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -233,7 +247,7 @@ /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. /// /// \param __W /// A 128-bit vector of [4 x int]. @@ -252,10 +266,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpwusds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hi)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding signed 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -268,7 +284,7 @@ /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. /// /// \param __W /// A 256-bit vector of [8 x int]. @@ -287,10 +303,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpwusds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hi)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -305,13 +323,13 @@ /// This intrinsic corresponds to the \c VPDPWUUD instruction. /// /// \param __W -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// \param __A /// A 128-bit vector of [8 x unsigned short]. /// \param __B /// A 128-bit vector of [8 x unsigned short]. /// \returns -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// /// \code{.operation} /// FOR j := 0 to 3 @@ -321,10 +339,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpwuud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -339,13 +359,13 @@ /// This intrinsic corresponds to the \c VPDPWUUD instruction. /// /// \param __W -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// \param __A /// A 256-bit vector of [16 x unsigned short]. /// \param __B /// A 256-bit vector of [16 x unsigned short]. /// \returns -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// /// \code{.operation} /// FOR j := 0 to 7 @@ -355,10 +375,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpwuud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -371,16 +393,16 @@ /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUUDS instruction. /// /// \param __W -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// \param __A /// A 128-bit vector of [8 x unsigned short]. /// \param __B /// A 128-bit vector of [8 x unsigned short]. /// \returns -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// /// \code{.operation} /// FOR j := 0 to 3 @@ -390,10 +412,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpwuuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hu)(__B))) +// clang-format off /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with /// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate /// signed 16-bit results. Sum these 2 results with the corresponding @@ -406,16 +430,16 @@ /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUUDS instruction. /// /// \param __W -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// \param __A /// A 256-bit vector of [16 x unsigned short]. /// \param __B /// A 256-bit vector of [16 x unsigned short]. /// \returns -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// /// \code{.operation} /// FOR j := 0 to 7 @@ -425,8 +449,9 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpwuuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hu)(__B))) #endif // __AVXVNNIINT16INTRIN_H diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h index 3c4c44a930fe2..1d2e8c906effc 100644 --- a/clang/lib/Headers/avxvnniintrin.h +++ b/clang/lib/Headers/avxvnniintrin.h @@ -109,7 +109,8 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v16hi)__A, + (__v16hi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with @@ -130,7 +131,8 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v16hi)__A, + (__v16hi)__B); } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with @@ -199,7 +201,8 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v8hi)__A, + (__v8hi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with @@ -220,7 +223,8 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v8hi)__A, + (__v8hi)__B); } #undef __DEFAULT_FN_ATTRS128 diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c index 728c9f5652dd9..1ba6e87653a74 100644 --- a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c @@ -176,20 +176,20 @@ __m512i test_mm512_maskz_dpbuuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, /* VNNI INT16 */ __m512i test_mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwsud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwsud_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwsud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwsud_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwsud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwsud_epi32(__U, __A, __B, __C); @@ -197,20 +197,20 @@ __m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, _ __m512i test_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwsuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwsuds_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwsuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwsuds_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwsuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwsuds_epi32(__U, __A, __B, __C); @@ -218,20 +218,20 @@ __m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i test_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwusd_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwusd_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwusd_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwusd_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwusd_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwusd_epi32(__U, __A, __B, __C); @@ -239,20 +239,20 @@ __m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, _ __m512i test_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwusds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwusds_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwusds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwusds_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwusds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwusds_epi32(__U, __A, __B, __C); @@ -260,20 +260,20 @@ __m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i test_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwuud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwuud_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwuud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwuud_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwuud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwuud_epi32(__U, __A, __B, __C); @@ -281,20 +281,20 @@ __m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, _ __m512i test_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwuuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwuuds_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwuuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwuuds_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwuuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwuuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwuuds_epi32(__U, __A, __B, __C); diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c index a250d91ae5989..be2719c33c52f 100644 --- a/clang/test/CodeGen/X86/avx10_2ni-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c @@ -259,168 +259,168 @@ __m256i test_mm256_maskz_dpbuuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, _ // VNNI INT16 __m128i test_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwsud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwsud_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwsud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwsud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwsud_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwsud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwsud_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwsud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwsud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwsud_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwsuds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwsuds_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwsuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwsuds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwsuds_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwsuds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwsuds_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwsuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwsuds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwsuds_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwusd_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwusd_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwusd_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwusd_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwusd_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwusd_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwusd_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwusd_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwusd_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwusd_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwusds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwusds_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwusds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwusds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwusds_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwusds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwusds_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwusds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwusds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwusds_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwuud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwuud_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwuud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwuud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwuud_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwuud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwuud_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwuud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwuud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwuud_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwuuds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwuuds_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwuuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwuuds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwuuds_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwuuds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwuuds_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwuuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwuuds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwuuds_epi32(__U, __A, __B, __C); } diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c index f63b5c6e73917..11dbd717a9f77 100644 --- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c @@ -47,41 +47,41 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpwssd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwssd_epi32(__S, __U, __A, __B); } __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpwssd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwssd_epi32(__U, __S, __A, __B); } __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssd_epi32(__S, __A, __B); } __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpwssds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwssds_epi32(__S, __U, __A, __B); } __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpwssds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwssds_epi32(__U, __S, __A, __B); } __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssds_epi32(__S, __A, __B); } @@ -127,41 +127,41 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpwssd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwssd_epi32(__S, __U, __A, __B); } __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpwssd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwssd_epi32(__U, __S, __A, __B); } __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssd_epi32(__S, __A, __B); } __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpwssds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwssds_epi32(__S, __U, __A, __B); } __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpwssds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwssds_epi32(__U, __S, __A, __B); } __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssds_epi32(__S, __A, __B); } diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c index afe80458e37cc..6b8465206eedb 100644 --- a/clang/test/CodeGen/X86/avx512vnni-builtins.c +++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c @@ -47,41 +47,41 @@ __m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) { __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpwssd_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwssd_epi32(__S, __U, __A, __B); } __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpwssd_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwssd_epi32(__U, __S, __A, __B); } __m512i test_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpwssd_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwssd_epi32(__S, __A, __B); } __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpwssds_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwssds_epi32(__S, __U, __A, __B); } __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpwssds_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwssds_epi32(__U, __S, __A, __B); } __m512i test_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpwssds_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwssds_epi32(__S, __A, __B); } diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c index 7948e0d57d9bf..6557a26807eb2 100644 --- a/clang/test/CodeGen/X86/avxvnni-builtins.c +++ b/clang/test/CodeGen/X86/avxvnni-builtins.c @@ -19,13 +19,13 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssd_epi32(__S, __A, __B); } __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssds_epi32(__S, __A, __B); } @@ -43,13 +43,13 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssd_epi32(__S, __A, __B); } __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssds_epi32(__S, __A, __B); } @@ -67,13 +67,13 @@ __m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { __m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssd_avx_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssd_avx_epi32(__S, __A, __B); } __m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssds_avx_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssds_avx_epi32(__S, __A, __B); } @@ -91,12 +91,12 @@ __m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { __m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssd_avx_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssd_avx_epi32(__S, __A, __B); } __m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssds_avx_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssds_avx_epi32(__S, __A, __B); } diff --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c index 941da9aa223b5..f28fff2c0cfec 100644 --- a/clang/test/CodeGen/X86/avxvnniint16-builtins.c +++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c @@ -11,72 +11,72 @@ __m128i test_mm_dpwsud_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwsud_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwsud_epi32(__A, __B, __C); } __m256i test_mm256_dpwsud_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwsud_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwsud_epi32(__A, __B, __C); } __m128i test_mm_dpwsuds_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwsuds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwsuds_epi32(__A, __B, __C); } __m256i test_mm256_dpwsuds_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwsuds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwsuds_epi32(__A, __B, __C); } __m128i test_mm_dpwusd_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwusd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwusd_epi32(__A, __B, __C); } __m256i test_mm256_dpwusd_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwusd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwusd_epi32(__A, __B, __C); } __m128i test_mm_dpwusds_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwusds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwusds_epi32(__A, __B, __C); } __m256i test_mm256_dpwusds_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwusds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwusds_epi32(__A, __B, __C); } __m128i test_mm_dpwuud_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwuud_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwuud_epi32(__A, __B, __C); } __m256i test_mm256_dpwuud_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwuud_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwuud_epi32(__A, __B, __C); } __m128i test_mm_dpwuuds_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwuuds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwuuds_epi32(__A, __B, __C); } __m256i test_mm256_dpwuuds_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwuuds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwuuds_epi32(__A, __B, __C); } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 1dd23f60c7e1e..ec80ba3e1ee81 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -1893,29 +1893,29 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_vpdpwssd_128 : ClangBuiltin<"__builtin_ia32_vpdpwssd128">, - DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, - llvm_v4i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, + llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_256 : ClangBuiltin<"__builtin_ia32_vpdpwssd256">, - DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, - llvm_v8i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty, + llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_512 : ClangBuiltin<"__builtin_ia32_vpdpwssd512">, - DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, - llvm_v16i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty, + llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_128 : ClangBuiltin<"__builtin_ia32_vpdpwssds128">, - DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, - llvm_v4i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, + llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_256 : ClangBuiltin<"__builtin_ia32_vpdpwssds256">, - DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, - llvm_v8i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty, + llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_512 : ClangBuiltin<"__builtin_ia32_vpdpwssds512">, - DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, - llvm_v16i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty, + llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbssd_128 : ClangBuiltin<"__builtin_ia32_vpdpbssd128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], @@ -1980,62 +1980,62 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_vpdpwsud_128 : ClangBuiltin<"__builtin_ia32_vpdpwsud128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwsud_256 : ClangBuiltin<"__builtin_ia32_vpdpwsud256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwsuds_128 : ClangBuiltin<"__builtin_ia32_vpdpwsuds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwsuds_256 : ClangBuiltin<"__builtin_ia32_vpdpwsuds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwusd_128 : ClangBuiltin<"__builtin_ia32_vpdpwusd128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwusd_256 : ClangBuiltin<"__builtin_ia32_vpdpwusd256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwusds_128 : ClangBuiltin<"__builtin_ia32_vpdpwusds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwusds_256 : ClangBuiltin<"__builtin_ia32_vpdpwusds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwuud_128 : ClangBuiltin<"__builtin_ia32_vpdpwuud128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwuud_256 : ClangBuiltin<"__builtin_ia32_vpdpwuud256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwuuds_128 : ClangBuiltin<"__builtin_ia32_vpdpwuuds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwuuds_256 : ClangBuiltin<"__builtin_ia32_vpdpwuuds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; } @@ -5031,32 +5031,32 @@ let TargetPrefix = "x86" in { def int_x86_avx10_vpdpwsud_512 : ClangBuiltin<"__builtin_ia32_vpdpwsud512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwsuds_512 : ClangBuiltin<"__builtin_ia32_vpdpwsuds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwusd_512 : ClangBuiltin<"__builtin_ia32_vpdpwusd512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwusds_512 : ClangBuiltin<"__builtin_ia32_vpdpwusds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwuud_512 : ClangBuiltin<"__builtin_ia32_vpdpwuud512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwuuds_512 : ClangBuiltin<"__builtin_ia32_vpdpwuuds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; // VMPSADBW diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 58b7ddd0381e5..2ae65efc70da6 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -125,6 +125,24 @@ static bool upgradeX86MultiplyAddBytes(Function *F, Intrinsic::ID IID, return true; } +// Upgrade the declaration of multipy and add words intrinsics whose input +// arguments' types have changed to vectors of i32 to vectors of i16 +static bool upgradeX86MultiplyAddWords(Function *F, Intrinsic::ID IID, + Function *&NewFn) { + // check if input argument type is a vector of i16 + Type *Arg1Type = F->getFunctionType()->getParamType(1); + Type *Arg2Type = F->getFunctionType()->getParamType(2); + if (Arg1Type->isVectorTy() && + cast(Arg1Type)->getElementType()->isIntegerTy(16) && + Arg2Type->isVectorTy() && + cast(Arg2Type)->getElementType()->isIntegerTy(16)) + return false; + + rename(F); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); + return true; +} + static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, Function *&NewFn) { if (F->getReturnType()->getScalarType()->isBFloatTy()) @@ -590,43 +608,89 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) return upgradeX86MultiplyAddBytes(F, ID, NewFn); + } else if (Name.starts_with("vpdpwssd.") || + Name.starts_with("vpdpwssds.")) { + // Added in 21.1 + ID = StringSwitch(Name) + .Case("vpdpwssd.128", Intrinsic::x86_avx512_vpdpwssd_128) + .Case("vpdpwssd.256", Intrinsic::x86_avx512_vpdpwssd_256) + .Case("vpdpwssd.512", Intrinsic::x86_avx512_vpdpwssd_512) + .Case("vpdpwssds.128", Intrinsic::x86_avx512_vpdpwssds_128) + .Case("vpdpwssds.256", Intrinsic::x86_avx512_vpdpwssds_256) + .Case("vpdpwssds.512", Intrinsic::x86_avx512_vpdpwssds_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddWords(F, ID, NewFn); } return false; // No other 'x86.avx512.*'. } - if (Name.consume_front("avx2.vpdpb")) { - // Added in 21.1 - ID = StringSwitch(Name) - .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128) - .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256) - .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128) - .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256) - .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128) - .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256) - .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128) - .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256) - .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128) - .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256) - .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128) - .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256) - .Default(Intrinsic::not_intrinsic); - if (ID != Intrinsic::not_intrinsic) - return upgradeX86MultiplyAddBytes(F, ID, NewFn); + if (Name.consume_front("avx2.")) { + if (Name.consume_front("vpdpb")) { + // Added in 21.1 + ID = StringSwitch(Name) + .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128) + .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256) + .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128) + .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256) + .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128) + .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256) + .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128) + .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256) + .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128) + .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256) + .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128) + .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddBytes(F, ID, NewFn); + } else if (Name.consume_front("vpdpw")) { + // Added in 21.1 + ID = StringSwitch(Name) + .Case("sud.128", Intrinsic::x86_avx2_vpdpwsud_128) + .Case("sud.256", Intrinsic::x86_avx2_vpdpwsud_256) + .Case("suds.128", Intrinsic::x86_avx2_vpdpwsuds_128) + .Case("suds.256", Intrinsic::x86_avx2_vpdpwsuds_256) + .Case("usd.128", Intrinsic::x86_avx2_vpdpwusd_128) + .Case("usd.256", Intrinsic::x86_avx2_vpdpwusd_256) + .Case("usds.128", Intrinsic::x86_avx2_vpdpwusds_128) + .Case("usds.256", Intrinsic::x86_avx2_vpdpwusds_256) + .Case("uud.128", Intrinsic::x86_avx2_vpdpwuud_128) + .Case("uud.256", Intrinsic::x86_avx2_vpdpwuud_256) + .Case("uuds.128", Intrinsic::x86_avx2_vpdpwuuds_128) + .Case("uuds.256", Intrinsic::x86_avx2_vpdpwuuds_256) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddWords(F, ID, NewFn); + } return false; // No other 'x86.avx2.*' } - if (Name.consume_front("avx10.vpdpb")) { - // Added in 21.1 - ID = StringSwitch(Name) - .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512) - .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512) - .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512) - .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512) - .Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512) - .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512) - .Default(Intrinsic::not_intrinsic); - if (ID != Intrinsic::not_intrinsic) - return upgradeX86MultiplyAddBytes(F, ID, NewFn); + if (Name.consume_front("avx10.")) { + if (Name.consume_front("vpdpb")) { + // Added in 21.1 + ID = StringSwitch(Name) + .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512) + .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512) + .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512) + .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512) + .Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512) + .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddBytes(F, ID, NewFn); + } else if (Name.consume_front("vpdpw")) { + ID = StringSwitch(Name) + .Case("sud.512", Intrinsic::x86_avx10_vpdpwsud_512) + .Case("suds.512", Intrinsic::x86_avx10_vpdpwsuds_512) + .Case("usd.512", Intrinsic::x86_avx10_vpdpwusd_512) + .Case("usds.512", Intrinsic::x86_avx10_vpdpwusds_512) + .Case("uud.512", Intrinsic::x86_avx10_vpdpwuud_512) + .Case("uuds.512", Intrinsic::x86_avx10_vpdpwuuds_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddWords(F, ID, NewFn); + } return false; // No other 'x86.avx10.*' } @@ -4307,6 +4371,32 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; + + // Input arguments types were incorrectly set to vectors of i32 before but + // they should be vectors of i16. Insert bit cast when encountering the old + // types + if (Args[1]->getType()->isVectorTy() && + cast(Args[1]->getType()) + ->getElementType() + ->isIntegerTy(32) && + Args[2]->getType()->isVectorTy() && + cast(Args[2]->getType()) + ->getElementType() + ->isIntegerTy(32)) { + Type *NewArgType = nullptr; + if (VecWidth == 128) + NewArgType = VectorType::get(Builder.getInt16Ty(), 8, false); + else if (VecWidth == 256) + NewArgType = VectorType::get(Builder.getInt16Ty(), 16, false); + else if (VecWidth == 512) + NewArgType = VectorType::get(Builder.getInt16Ty(), 32, false); + else + llvm_unreachable("Unexpected vector bit width"); + + Args[1] = Builder.CreateBitCast(Args[1], NewArgType); + Args[2] = Builder.CreateBitCast(Args[2], NewArgType); + } + Rep = Builder.CreateIntrinsic(IID, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); @@ -5382,6 +5472,39 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { NewCall = Builder.CreateCall(NewFn, Args); break; } + case Intrinsic::x86_avx512_vpdpwssd_128: + case Intrinsic::x86_avx512_vpdpwssd_256: + case Intrinsic::x86_avx512_vpdpwssd_512: + case Intrinsic::x86_avx512_vpdpwssds_128: + case Intrinsic::x86_avx512_vpdpwssds_256: + case Intrinsic::x86_avx512_vpdpwssds_512: + case Intrinsic::x86_avx2_vpdpwsud_128: + case Intrinsic::x86_avx2_vpdpwsud_256: + case Intrinsic::x86_avx10_vpdpwsud_512: + case Intrinsic::x86_avx2_vpdpwsuds_128: + case Intrinsic::x86_avx2_vpdpwsuds_256: + case Intrinsic::x86_avx10_vpdpwsuds_512: + case Intrinsic::x86_avx2_vpdpwusd_128: + case Intrinsic::x86_avx2_vpdpwusd_256: + case Intrinsic::x86_avx10_vpdpwusd_512: + case Intrinsic::x86_avx2_vpdpwusds_128: + case Intrinsic::x86_avx2_vpdpwusds_256: + case Intrinsic::x86_avx10_vpdpwusds_512: + case Intrinsic::x86_avx2_vpdpwuud_128: + case Intrinsic::x86_avx2_vpdpwuud_256: + case Intrinsic::x86_avx10_vpdpwuud_512: + case Intrinsic::x86_avx2_vpdpwuuds_128: + case Intrinsic::x86_avx2_vpdpwuuds_256: + case Intrinsic::x86_avx10_vpdpwuuds_512: + unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / 16; + Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + Type *NewArgType = VectorType::get(Builder.getInt16Ty(), NumElts, false); + Args[1] = Builder.CreateBitCast(Args[1], NewArgType); + Args[2] = Builder.CreateBitCast(Args[2], NewArgType); + + NewCall = Builder.CreateCall(NewFn, Args); + break; } assert(NewCall && "Should have either set this variable or returned through " "the default case"); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index ceeece41782f4..54d5145ad64a5 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -5875,52 +5875,118 @@ struct MemorySanitizerVisitor : public InstVisitor { // // Multiply and Add Signed Word Integers // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) // < 8 x i32> @llvm.x86.avx512.vpdpwssd.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <16 x i16>, <16 x i16>) // <16 x i32> @llvm.x86.avx512.vpdpwssd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <32 x i16>, <32 x i16>) // // Multiply and Add Signed Word Integers With Saturation // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) // < 8 x i32> @llvm.x86.avx512.vpdpwssds.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <16 x i16>, <16 x i16>) // <16 x i32> @llvm.x86.avx512.vpdpwssds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Signed and Unsigned Word Integers + // < 4 x i32> @llvm.x86.avx2.vpdpwsud.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwsud.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwsud.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Signed and Unsigned Word Integers With Saturation + // < 4 x i32> @llvm.x86.avx2.vpdpwsuds.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwsuds.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwsuds.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Unsigned and Signed Word Integers + // < 4 x i32> @llvm.x86.avx2.vpdpwusd.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwusd.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwusd.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Unsigned and Signed Word Integers With Saturation + // < 4 x i32> @llvm.x86.avx2.vpdpwusds.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwusds.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwusds.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Unsigned and Unsigned Word Integers + // < 4 x i32> @llvm.x86.avx2.vpdpwuud.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwuud.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwuud.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Unsigned and Unsigned Word Integers With Saturation + // < 4 x i32> @llvm.x86.avx2.vpdpwuuds.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwuuds.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwuuds.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) // // These intrinsics are auto-upgraded into non-masked forms: // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <8 x i16>, <8 x i16>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <8 x i16>, <8 x i16>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <16 x i16>, <16 x i16>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <16 x i16>, <16 x i16>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <32 x i16>, <32 x i16>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <32 x i16>, <32 x i16>, i16) // // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <8 x i16>, <8 x i16>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <8 x i16>, <8 x i16>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <16 x i16>, <16 x i16>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <16 x i16>, <16 x i16>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <32 x i16>, <32 x i16>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <32 x i16>, <32 x i16>, i16) case Intrinsic::x86_avx512_vpdpwssd_128: case Intrinsic::x86_avx512_vpdpwssd_256: case Intrinsic::x86_avx512_vpdpwssd_512: case Intrinsic::x86_avx512_vpdpwssds_128: case Intrinsic::x86_avx512_vpdpwssds_256: case Intrinsic::x86_avx512_vpdpwssds_512: + case Intrinsic::x86_avx2_vpdpwsud_128: + case Intrinsic::x86_avx2_vpdpwsud_256: + case Intrinsic::x86_avx10_vpdpwsud_512: + case Intrinsic::x86_avx2_vpdpwsuds_128: + case Intrinsic::x86_avx2_vpdpwsuds_256: + case Intrinsic::x86_avx10_vpdpwsuds_512: + case Intrinsic::x86_avx2_vpdpwusd_128: + case Intrinsic::x86_avx2_vpdpwusd_256: + case Intrinsic::x86_avx10_vpdpwusd_512: + case Intrinsic::x86_avx2_vpdpwusds_128: + case Intrinsic::x86_avx2_vpdpwusds_256: + case Intrinsic::x86_avx10_vpdpwusds_512: + case Intrinsic::x86_avx2_vpdpwuud_128: + case Intrinsic::x86_avx2_vpdpwuud_256: + case Intrinsic::x86_avx10_vpdpwuud_512: + case Intrinsic::x86_avx2_vpdpwuuds_128: + case Intrinsic::x86_avx2_vpdpwuuds_256: + case Intrinsic::x86_avx10_vpdpwuuds_512: handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; diff --git a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll index 76d84c1159ee4..860d60ff0d4e1 100644 --- a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll @@ -97,3 +97,99 @@ define <16 x i32>@test_int_x86_avx10_vpdpbuuds_512(<16 x i32> %x0, <16 x i32> %x %res = call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) ret <16 x i32> %res } + +declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwsud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwsud_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwsud_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwsuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwsuds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwsuds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwusd_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwusd_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwusds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwusds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwuud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwuud_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwuud_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwuuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwuuds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwuuds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll index a2aad604f19bc..e9c6cb6a19ba4 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll @@ -220,7 +220,7 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8 ; VNNI INT16 -define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpwsud_epi32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -231,12 +231,12 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; X64: # %bb.0: ; X64-NEXT: vpdpwsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) { ; X86-LABEL: test_mm512_mask_dpwsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -248,13 +248,13 @@ define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) { ; X86-LABEL: test_mm512_maskz_dpwsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -266,14 +266,14 @@ define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>) define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpwusd_epi32: diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll index 1f270d539cdb4..bf7f9375570f9 100644 --- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll @@ -334,7 +334,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) ; VNNI INT16 -define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) { ; X86-LABEL: test_mm_mask_dpwsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -346,13 +346,13 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) { ; X86-LABEL: test_mm_maskz_dpwsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -364,13 +364,13 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) { ; X86-LABEL: test_mm256_maskz_dpwsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -382,13 +382,13 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) { ; X86-LABEL: test_mm256_mask_dpwsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -400,16 +400,16 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>) define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { ; X86-LABEL: test_mm_mask_dpwusd_epi32: diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll index b8ebe2a4890a1..ddf0050dbd74a 100644 --- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll @@ -178,18 +178,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ret { <4 x i32>, <4 x i32> } %res2 } -declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %1 } -define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) { +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] @@ -209,11 +209,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; X64-NEXT: vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda] ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %x2 = load <16 x i16>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 - %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 @@ -221,18 +221,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ret { <8 x i32>, <8 x i32> } %res2 } -declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %1 } -define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) { +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] @@ -252,12 +252,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; X64-NEXT: vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda] ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <8 x i16>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 - %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer @@ -266,18 +266,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ret { <4 x i32>, <4 x i32> } %res2 } -declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x53,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %1 } -define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) { +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] @@ -297,11 +297,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; X64-NEXT: vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda] ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %x2 = load <16 x i16>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 - %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 @@ -309,9 +309,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ret { <8 x i32>, <8 x i32> } %res2 } -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p) { +define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p) { ; X86-LABEL: test_int_x86_avx512_vpdpwssds_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -322,12 +322,12 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; X64: # %bb.0: ; X64-NEXT: vpdpwssds (%rdi), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x53,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <8 x i16>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %1 } -define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) { +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] @@ -347,12 +347,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; X64-NEXT: vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda] ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <8 x i16>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 - %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll index 63ff88a7fa4ae..35363e99ef4c8 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll @@ -102,11 +102,22 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpbusds_512(<16 x ret { <16 x i32>, <16 x i32> } %res3 } +declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_vpdpwssd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpdpwssd: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) -define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { -; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512: +define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] @@ -114,8 +125,8 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ret <16 x i32> %res } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -125,7 +136,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: +; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512: ; X64: # %bb.0: ; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] @@ -141,20 +152,31 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ret { <16 x i32>, <16 x i32> } %res3 } -declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) -declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) ret <16 x i32> %res } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -164,7 +186,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: +; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512: ; X64: # %bb.0: ; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll index 60d0298e057f3..e97b8a5c5503f 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll @@ -86,18 +86,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ret { <16 x i32>, <16 x i32> } %res2 } -declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] @@ -116,11 +116,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda] ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <32 x i16>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 @@ -128,18 +128,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ret { <16 x i32>, <16 x i32> } %res2 } -declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_ask_vpdpwssds_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] @@ -158,11 +158,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda] ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <32 x i16>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll index 0f4a4f27b9715..f359ecef8ceb3 100644 --- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll @@ -45,3 +45,47 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) ret <8 x i32> %res } + +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll index de8b2a41bf8c8..5748a426c76c3 100644 --- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll @@ -68,9 +68,9 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <1 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2] @@ -80,13 +80,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2] @@ -96,13 +96,13 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2] @@ -112,13 +112,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2] @@ -128,6 +128,6 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %res } diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll new file mode 100644 index 0000000000000..abdc296ae1e1c --- /dev/null +++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 + +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll index abdc296ae1e1c..7576b12645bd0 100644 --- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2] @@ -14,12 +14,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2] @@ -29,12 +29,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2] @@ -44,12 +44,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2] @@ -59,12 +59,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2] @@ -74,12 +74,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2] @@ -89,12 +89,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2] @@ -104,12 +104,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2] @@ -119,12 +119,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2] @@ -134,12 +134,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2] @@ -149,12 +149,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2] @@ -164,12 +164,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2] @@ -179,7 +179,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll index cd576b19f8766..345fa0efa42df 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll @@ -4,16 +4,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" -declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>) declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -23,11 +23,11 @@ define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -37,11 +37,11 @@ define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -51,11 +51,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -65,11 +65,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -79,11 +79,11 @@ define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -93,11 +93,11 @@ define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -107,11 +107,11 @@ define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -121,7 +121,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1) ret <8 x i32> %2 } diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll index 534352f322001..47537c8c4fb6e 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) @@ -14,7 +14,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -26,11 +26,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -42,11 +42,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -58,11 +58,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -74,7 +74,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll index 8900085af030d..9d5cabca5fef8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll @@ -497,12 +497,12 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <64 x i8>, <64 x i8> declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwsud_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -510,87 +510,123 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[TMP22]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwsuds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = and <32 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP19]], [[TMP22]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP23]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwsud_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = and <32 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP20]], [[TMP23]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP24]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP19]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwusd_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -598,33 +634,57 @@ define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP21]], [[TMP2]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwusds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] @@ -636,23 +696,35 @@ define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwusd_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer @@ -664,21 +736,21 @@ define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwuud_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -686,33 +758,57 @@ define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP21]], [[TMP2]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwuuds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] @@ -724,23 +820,35 @@ define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwuud_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer @@ -752,14 +860,14 @@ define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <32 x i16>, <32 x i16>) define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) sanitize_memory { diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll index def7ba3f10770..6a53a1595271e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll @@ -739,17 +739,29 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwsud_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -761,23 +773,35 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwsuds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -789,23 +813,35 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwsuds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -817,23 +853,35 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwsud_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -845,28 +893,40 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwusd_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -878,23 +938,35 @@ define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwusds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -906,23 +978,35 @@ define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwusds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -934,23 +1018,35 @@ define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwusd_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -962,28 +1058,40 @@ define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwuud_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -995,23 +1103,35 @@ define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwuuds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -1023,23 +1143,35 @@ define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwuuds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -1051,23 +1183,35 @@ define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwuud_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -1079,16 +1223,16 @@ define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <16 x i16>, <16 x i16>) define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) sanitize_memory { diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll index 5e937485ff282..d0daea5e68fea 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll @@ -528,10 +528,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -546,7 +546,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -574,10 +574,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -592,7 +592,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -601,10 +601,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -619,7 +619,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -653,10 +653,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer @@ -671,7 +671,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -699,10 +699,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -717,7 +717,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -728,10 +728,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -746,7 +746,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> @@ -783,10 +783,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -801,7 +801,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -829,10 +829,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -847,7 +847,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -856,10 +856,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -874,7 +874,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -908,10 +908,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer @@ -926,7 +926,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -954,10 +954,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -972,7 +972,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -983,10 +983,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -1001,7 +1001,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll index 1d3046804b74f..f2b1e16e3d5c2 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll @@ -495,10 +495,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -513,7 +513,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -541,10 +541,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -559,7 +559,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -568,10 +568,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -586,7 +586,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -623,10 +623,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer @@ -641,7 +641,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -669,10 +669,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -687,7 +687,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -698,10 +698,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -716,7 +716,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> @@ -757,10 +757,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -775,7 +775,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -803,10 +803,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -821,7 +821,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -830,10 +830,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -848,7 +848,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -896,10 +896,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <8 x i16> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i16> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i16> [[TMP26]], zeroinitializer @@ -914,7 +914,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32> [[TMP23]], zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP25]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP26]], <8 x i16> [[TMP10]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; @@ -943,10 +943,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -961,7 +961,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -972,10 +972,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -990,7 +990,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll index 5c99f8a3a1fb6..4e7598b92abcf 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll @@ -270,10 +270,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -288,7 +288,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -316,10 +316,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -334,7 +334,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -343,10 +343,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -361,7 +361,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer @@ -395,10 +395,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> % ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -413,7 +413,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> % ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -441,10 +441,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -459,7 +459,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -468,10 +468,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -486,7 +486,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll index 236ff45c6cd08..0fd60149cc15e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll @@ -251,10 +251,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -269,7 +269,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -297,10 +297,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -315,7 +315,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -324,10 +324,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -342,7 +342,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer @@ -379,10 +379,10 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -397,7 +397,7 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -425,10 +425,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -443,7 +443,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -452,10 +452,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -470,7 +470,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll index 0344fbd5ee2a9..0a6c5f7b089ba 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll @@ -143,10 +143,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer @@ -161,7 +161,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; @@ -178,10 +178,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer @@ -196,7 +196,7 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; @@ -213,10 +213,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer @@ -231,7 +231,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; @@ -248,10 +248,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer @@ -266,7 +266,7 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll index 707b46bb8686e..fd9e0b953d002 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll @@ -22,218 +22,362 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsud_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) -; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) +; CHECK-NEXT: store <4 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsud_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) -; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) +; CHECK-NEXT: store <8 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) -; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) +; CHECK-NEXT: store <4 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) -; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) +; CHECK-NEXT: store <8 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusd_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusd_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusds_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusds_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuud_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuud_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)