diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 527acd9ef086e..dea55794ee4ce 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1108,27 +1108,27 @@ let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWi } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">; } let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">; } let Features = "avx512vnni,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index d1e5cd9d6983f..3ae4c3be57542 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -41,8 +41,8 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpbusd_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpbusd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v32qi)(A), (__v32qi)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -61,8 +61,9 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpbusds_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpbusds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v32qi)(A), \ + (__v32qi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -117,8 +118,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpbusd_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpbusd_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v16qi)(A), (__v16qi)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -137,8 +138,9 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpbusds_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpbusds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v16qi)(A), \ + (__v16qi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h index 0fb381a12f2fd..1c8769c821fe2 100644 --- a/clang/lib/Headers/avx512vnniintrin.h +++ b/clang/lib/Headers/avx512vnniintrin.h @@ -22,8 +22,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v64qi)__A, + (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -45,8 +45,8 @@ _mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v64qi)__A, + (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h index b7de562b57c06..ebb72bbaf6657 100644 --- a/clang/lib/Headers/avxvnniintrin.h +++ b/clang/lib/Headers/avxvnniintrin.h @@ -63,7 +63,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v32qi)__A, + (__v32qi)__B); } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with @@ -86,7 +87,8 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v32qi)__A, + (__v32qi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with @@ -151,7 +153,8 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v16qi)__A, + (__v16qi)__B); } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with @@ -174,7 +177,8 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v16qi)__A, + (__v16qi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c index 3de4cca1a7e23..f63b5c6e73917 100644 --- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c @@ -7,41 +7,41 @@ __m256i test_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpbusd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpbusd_epi32(__S, __U, __A, __B); } __m256i test_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpbusd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpbusd_epi32(__U, __S, __A, __B); } __m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusd_epi32(__S, __A, __B); } __m256i test_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpbusds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpbusds_epi32(__S, __U, __A, __B); } __m256i test_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpbusds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpbusds_epi32(__U, __S, __A, __B); } __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusds_epi32(__S, __A, __B); } @@ -87,41 +87,41 @@ __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { __m128i test_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpbusd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpbusd_epi32(__S, __U, __A, __B); } __m128i test_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpbusd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpbusd_epi32(__U, __S, __A, __B); } __m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusd_epi32(__S, __A, __B); } __m128i test_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpbusds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpbusds_epi32(__S, __U, __A, __B); } __m128i test_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpbusds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpbusds_epi32(__U, __S, __A, __B); } __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusds_epi32(__S, __A, __B); } diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c index a0177b3ba0a2c..afe80458e37cc 100644 --- a/clang/test/CodeGen/X86/avx512vnni-builtins.c +++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c @@ -7,41 +7,41 @@ __m512i test_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpbusd_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpbusd_epi32(__S, __U, __A, __B); } __m512i test_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpbusd_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpbusd_epi32(__U, __S, __A, __B); } __m512i test_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpbusd_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}) return _mm512_dpbusd_epi32(__S, __A, __B); } __m512i test_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpbusds_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpbusds_epi32(__S, __U, __A, __B); } __m512i test_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpbusds_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpbusds_epi32(__U, __S, __A, __B); } __m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpbusds_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}) return _mm512_dpbusds_epi32(__S, __A, __B); } diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c index bb28a359424c8..7948e0d57d9bf 100644 --- a/clang/test/CodeGen/X86/avxvnni-builtins.c +++ b/clang/test/CodeGen/X86/avxvnni-builtins.c @@ -7,13 +7,13 @@ __m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusd_epi32(__S, __A, __B); } __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusds_epi32(__S, __A, __B); } @@ -31,13 +31,13 @@ __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { __m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusd_epi32(__S, __A, __B); } __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusds_epi32(__S, __A, __B); } @@ -55,13 +55,13 @@ __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) { __m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusd_avx_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusd_avx_epi32(__S, __A, __B); } __m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusds_avx_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusds_avx_epi32(__S, __A, __B); } @@ -79,13 +79,13 @@ __m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { __m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusd_avx_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusd_avx_epi32(__S, __A, __B); } __m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusds_avx_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusds_avx_epi32(__S, __A, __B); } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index dff5785531ada..4af9ffc52ba6b 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -1867,29 +1867,29 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_vpdpbusd_128 : ClangBuiltin<"__builtin_ia32_vpdpbusd128">, - DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, - llvm_v4i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v16i8_ty, + llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusd_256 : ClangBuiltin<"__builtin_ia32_vpdpbusd256">, - DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, - llvm_v8i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v32i8_ty, + llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusd_512 : ClangBuiltin<"__builtin_ia32_vpdpbusd512">, - DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, - llvm_v16i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v64i8_ty, + llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusds_128 : ClangBuiltin<"__builtin_ia32_vpdpbusds128">, - DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, - llvm_v4i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v16i8_ty, + llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusds_256 : ClangBuiltin<"__builtin_ia32_vpdpbusds256">, - DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, - llvm_v8i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v32i8_ty, + llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusds_512 : ClangBuiltin<"__builtin_ia32_vpdpbusds512">, - DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, - llvm_v16i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v64i8_ty, + llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_128 : ClangBuiltin<"__builtin_ia32_vpdpwssd128">, diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index e200f3626e69d..bb5e9ab0830c8 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -4148,6 +4148,34 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; + + // Input arguments types were incorrectly set to vectors of i32 before but + // they should be vectors of i8. Insert bit cast when encountering the old + // types + if (Args[1]->getType()->isVectorTy() && + cast(Args[1]->getType()) + ->getElementType() + ->isIntegerTy(32) && + Args[2]->getType()->isVectorTy() && + cast(Args[2]->getType()) + ->getElementType() + ->isIntegerTy(32)) { + Type *NewArgType = nullptr; + if (VecWidth == 128) + NewArgType = VectorType::get(Builder.getInt8Ty(), 16, false); + else if (VecWidth == 256) + NewArgType = VectorType::get(Builder.getInt8Ty(), 32, false); + else if (VecWidth == 512) + NewArgType = VectorType::get(Builder.getInt8Ty(), 64, false); + else + llvm_unreachable("Unexpected vector bit width"); + + if (NewArgType) { + Args[1] = Builder.CreateBitCast(Args[1], NewArgType); + Args[2] = Builder.CreateBitCast(Args[2], NewArgType); + } + } + Rep = Builder.CreateIntrinsic(IID, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 27292d1a66c30..a0c89c95d8633 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3860,7 +3860,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // // Three operands: // <4 x i32> @llvm.x86.avx512.vpdpbusd.128 - // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b) + // (<4 x i32> %s, <16 x i8> %a, <16 x i8> %b) // (this is equivalent to multiply-add on %a and %b, followed by // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction @@ -3902,8 +3902,9 @@ struct MemorySanitizerVisitor : public InstVisitor { ReturnType->getPrimitiveSizeInBits()); if (I.arg_size() == 3) { - assert(ParamType == ReturnType); - assert(ParamType == I.getArgOperand(0)->getType()); + FixedVectorType *AccumulatorType = + cast(I.getOperand(0)->getType()); + assert(AccumulatorType == ReturnType); } FixedVectorType *ImplicitReturnType = ReturnType; @@ -5621,19 +5622,19 @@ struct MemorySanitizerVisitor : public InstVisitor { // // Multiply and Add Packed Signed and Unsigned Bytes // < 4 x i32> @llvm.x86.avx512.vpdpbusd.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, <16 x i8>, <16 x i8>) // < 8 x i32> @llvm.x86.avx512.vpdpbusd.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <32 x i8>, <32 x i8>) // <16 x i32> @llvm.x86.avx512.vpdpbusd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <64 x i8>, <64 x i8>) // // Multiply and Add Unsigned and Signed Bytes With Saturation // < 4 x i32> @llvm.x86.avx512.vpdpbusds.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, <16 x i8>, <16 x i8>) // < 8 x i32> @llvm.x86.avx512.vpdpbusds.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <32 x i8>, <32 x i8>) // <16 x i32> @llvm.x86.avx512.vpdpbusds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <64 x i8>, <64 x i8>) // // < 4 x i32> @llvm.x86.avx2.vpdpbssd.128 // (< 4 x i32>, < 4 x i32>, < 4 x i32>) @@ -5652,30 +5653,30 @@ struct MemorySanitizerVisitor : public InstVisitor { // // These intrinsics are auto-upgraded into non-masked forms: // <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) // // <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <16 x i8>, <16 x i8>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <32 x i8>, <32 x i8>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <64 x i8>, <64 x i8>, i16) case Intrinsic::x86_avx512_vpdpbusd_128: case Intrinsic::x86_avx512_vpdpbusd_256: case Intrinsic::x86_avx512_vpdpbusd_512: diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll index 7613c9ff43e21..b8ebe2a4890a1 100644 --- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll @@ -2,18 +2,18 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 -declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpbusd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x50,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) ret <8 x i32> %1 } -define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) { +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_256: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] @@ -33,11 +33,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; X64-NEXT: vpdpbusd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xda] ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 - %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %5 = bitcast i8 %x3 to <8 x i1> %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 @@ -45,18 +45,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ret { <8 x i32>, <8 x i32> } %res2 } -declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpbusd_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x50,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) ret <4 x i32> %1 } -define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) { +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_128: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] @@ -76,12 +76,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; X64-NEXT: vpdpbusd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xda] ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 - %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %5 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer @@ -90,18 +90,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ret { <4 x i32>, <4 x i32> } %res2 } -declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x51,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) ret <8 x i32> %1 } -define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) { +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_256: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] @@ -121,11 +121,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; X64-NEXT: vpdpbusds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xda] ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 - %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %5 = bitcast i8 %x3 to <8 x i1> %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 @@ -133,18 +133,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ret { <8 x i32>, <8 x i32> } %res2 } -declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x51,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) ret <4 x i32> %1 } -define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) { +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_128: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] @@ -164,12 +164,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; X64-NEXT: vpdpbusds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xda] ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 - %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %5 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll index 21d0010ff6303..60d0298e057f3 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll @@ -2,18 +2,18 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 -declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_ask_vpdpbusd_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpbusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x50,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] @@ -32,11 +32,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; X64-NEXT: vpdpbusd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xda] ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <64 x i8>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 @@ -44,18 +44,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ret { <16 x i32>, <16 x i32> } %res2 } -declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpbusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x51,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] @@ -74,11 +74,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; X64-NEXT: vpdpbusds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xda] ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <64 x i8>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll index a1db6e54fa796..de8b2a41bf8c8 100644 --- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll @@ -4,9 +4,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI -declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_256: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x50,0xc2] @@ -16,13 +16,13 @@ define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x50,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_128: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x50,0xc2] @@ -32,13 +32,13 @@ define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x50,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_256: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x51,0xc2] @@ -48,13 +48,13 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x51,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_128: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x51,0xc2] @@ -64,7 +64,7 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x51,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) ret <4 x i32> %res } diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll index 4b0f63f9a6389..cd576b19f8766 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll @@ -8,10 +8,10 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>) define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd: @@ -125,7 +125,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusd: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -135,11 +135,11 @@ define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusd_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -150,11 +150,11 @@ define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpbusd %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -164,11 +164,11 @@ define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusd_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -179,11 +179,11 @@ define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: {vex} vpdpbusd %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -193,11 +193,11 @@ define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -208,11 +208,11 @@ define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpbusds %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -222,11 +222,11 @@ define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbusds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -237,6 +237,6 @@ define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: {vex} vpdpbusds %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll index 822e546c84bca..777ca4548c71f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll @@ -20,10 +20,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer @@ -40,7 +40,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[TMP24]], <32 x i8> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -68,10 +68,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> ; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer @@ -88,7 +88,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[TMP30]], <32 x i8> [[TMP31]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -97,10 +97,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> ; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> ; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer @@ -117,7 +117,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[TMP39]], <32 x i8> [[TMP40]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -151,10 +151,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer @@ -171,7 +171,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[TMP24]], <16 x i8> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -199,10 +199,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> ; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer @@ -219,7 +219,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[TMP30]], <16 x i8> [[TMP31]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -230,10 +230,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> ; CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> ; CHECK-NEXT: [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer @@ -250,7 +250,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[TMP39]], <16 x i8> [[TMP40]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> @@ -286,10 +286,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer @@ -306,7 +306,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[TMP24]], <32 x i8> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -334,10 +334,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> ; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer @@ -354,7 +354,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[TMP30]], <32 x i8> [[TMP31]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -363,10 +363,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> ; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> ; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer @@ -383,7 +383,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[TMP39]], <32 x i8> [[TMP40]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -417,10 +417,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer @@ -437,7 +437,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[TMP24]], <16 x i8> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -465,10 +465,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> ; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer @@ -485,7 +485,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[TMP30]], <16 x i8> [[TMP31]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -496,10 +496,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> ; CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> ; CHECK-NEXT: [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer @@ -516,7 +516,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[TMP39]], <16 x i8> [[TMP40]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll index 38f4272ef106f..244eb54067ea8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll @@ -10,26 +10,22 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusd_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8> @@ -39,22 +35,22 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; - %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) ret <8 x i32> %1 } -define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] @@ -62,22 +58,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: [[BB7]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = load <32 x i8>, ptr [[TMP10]], align 32 ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP38:%.*]] = and <32 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP58:%.*]] = and <32 x i1> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[TMP59:%.*]] = and <32 x i1> [[TMP34]], [[TMP37]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP58:%.*]] = and <32 x i1> [[TMP31]], [[TMP36]] +; CHECK-NEXT: [[TMP59:%.*]] = and <32 x i1> [[TMP35]], [[TMP32]] ; CHECK-NEXT: [[TMP60:%.*]] = or <32 x i1> [[TMP38]], [[TMP58]] ; CHECK-NEXT: [[TMP61:%.*]] = or <32 x i1> [[TMP60]], [[TMP59]] ; CHECK-NEXT: [[TMP62:%.*]] = sext <32 x i1> [[TMP61]] to <32 x i8> @@ -87,7 +79,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -96,17 +88,13 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8> -; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <32 x i8> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = and <32 x i1> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP48:%.*]] = and <32 x i1> [[TMP45]], [[TMP44]] -; CHECK-NEXT: [[TMP49:%.*]] = and <32 x i1> [[TMP43]], [[TMP46]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = and <32 x i1> [[TMP39]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = and <32 x i1> [[TMP37]], [[TMP46]] +; CHECK-NEXT: [[TMP49:%.*]] = and <32 x i1> [[TMP39]], [[TMP41]] ; CHECK-NEXT: [[TMP50:%.*]] = or <32 x i1> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP51:%.*]] = or <32 x i1> [[TMP50]], [[TMP49]] ; CHECK-NEXT: [[TMP52:%.*]] = sext <32 x i1> [[TMP51]] to <32 x i8> @@ -116,7 +104,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -132,11 +120,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES2]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 - %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %5 = bitcast i8 %x3 to <8 x i1> %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 @@ -144,26 +132,22 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> ret { <8 x i32>, <8 x i32> } %res2 } -declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusd_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8> @@ -173,22 +157,22 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; - %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) ret <4 x i32> %1 } -define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -196,22 +180,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB7]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[TMP10]], align 16 ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP34]], [[TMP37]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP31]], [[TMP36]] +; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP35]], [[TMP32]] ; CHECK-NEXT: [[TMP60:%.*]] = or <16 x i1> [[TMP38]], [[TMP58]] ; CHECK-NEXT: [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]] ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i8> @@ -221,7 +201,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -232,17 +212,13 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <16 x i8> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = and <16 x i1> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP48:%.*]] = and <16 x i1> [[TMP45]], [[TMP44]] -; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i1> [[TMP43]], [[TMP46]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = and <16 x i1> [[TMP39]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = and <16 x i1> [[TMP37]], [[TMP46]] +; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i1> [[TMP39]], [[TMP41]] ; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i1> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP51:%.*]] = or <16 x i1> [[TMP50]], [[TMP49]] ; CHECK-NEXT: [[TMP52:%.*]] = sext <16 x i1> [[TMP51]] to <16 x i8> @@ -252,7 +228,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> @@ -270,12 +246,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES2]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 - %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %5 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer @@ -284,26 +260,22 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> ret { <4 x i32>, <4 x i32> } %res2 } -declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusds_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8> @@ -313,22 +285,22 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; - %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) ret <8 x i32> %1 } -define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -336,22 +308,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB7]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = load <32 x i8>, ptr [[TMP10]], align 32 ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP38:%.*]] = and <32 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP58:%.*]] = and <32 x i1> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[TMP59:%.*]] = and <32 x i1> [[TMP34]], [[TMP37]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP58:%.*]] = and <32 x i1> [[TMP31]], [[TMP36]] +; CHECK-NEXT: [[TMP59:%.*]] = and <32 x i1> [[TMP35]], [[TMP32]] ; CHECK-NEXT: [[TMP60:%.*]] = or <32 x i1> [[TMP38]], [[TMP58]] ; CHECK-NEXT: [[TMP61:%.*]] = or <32 x i1> [[TMP60]], [[TMP59]] ; CHECK-NEXT: [[TMP62:%.*]] = sext <32 x i1> [[TMP61]] to <32 x i8> @@ -361,7 +329,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <16 x i16> [[TMP65]] to i256 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i256 [[TMP66]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -370,17 +338,13 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i32> [[TMP5]] to <32 x i8> -; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i8> [[TMP41]], zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <32 x i8> [[TMP42]], zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <32 x i8> [[TMP39]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <32 x i8> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = and <32 x i1> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP48:%.*]] = and <32 x i1> [[TMP45]], [[TMP44]] -; CHECK-NEXT: [[TMP49:%.*]] = and <32 x i1> [[TMP43]], [[TMP46]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = and <32 x i1> [[TMP39]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = and <32 x i1> [[TMP37]], [[TMP46]] +; CHECK-NEXT: [[TMP49:%.*]] = and <32 x i1> [[TMP39]], [[TMP41]] ; CHECK-NEXT: [[TMP50:%.*]] = or <32 x i1> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP51:%.*]] = or <32 x i1> [[TMP50]], [[TMP49]] ; CHECK-NEXT: [[TMP52:%.*]] = sext <32 x i1> [[TMP51]] to <32 x i8> @@ -390,7 +354,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i16> [[TMP55]] to i256 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i256 [[TMP56]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -406,11 +370,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ; CHECK-NEXT: store { <8 x i32>, <8 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret { <8 x i32>, <8 x i32> } [[RES2]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 - %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %5 = bitcast i8 %x3 to <8 x i1> %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 @@ -418,26 +382,22 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32 ret { <8 x i32>, <8 x i32> } %res2 } -declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusds_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8> @@ -447,22 +407,22 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; - %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) ret <4 x i32> %1 } -define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -470,22 +430,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB7]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[TMP10]], align 16 ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP34]], [[TMP37]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP31]], [[TMP36]] +; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP35]], [[TMP32]] ; CHECK-NEXT: [[TMP60:%.*]] = or <16 x i1> [[TMP38]], [[TMP58]] ; CHECK-NEXT: [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]] ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i8> @@ -495,7 +451,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <8 x i16> [[TMP65]] to i128 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i128 [[TMP66]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -506,17 +462,13 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP42:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i8> [[TMP41]], zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <16 x i8> [[TMP42]], zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <16 x i8> [[TMP39]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <16 x i8> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = and <16 x i1> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP48:%.*]] = and <16 x i1> [[TMP45]], [[TMP44]] -; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i1> [[TMP43]], [[TMP46]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = and <16 x i1> [[TMP39]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = and <16 x i1> [[TMP37]], [[TMP46]] +; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i1> [[TMP39]], [[TMP41]] ; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i1> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP51:%.*]] = or <16 x i1> [[TMP50]], [[TMP49]] ; CHECK-NEXT: [[TMP52:%.*]] = sext <16 x i1> [[TMP51]] to <16 x i8> @@ -526,7 +478,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i16> [[TMP55]] to i128 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> @@ -544,12 +496,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ; CHECK-NEXT: store { <4 x i32>, <4 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } [[RES2]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 - %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %5 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll index f146823b90e03..b64c033f8f882 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll @@ -20,10 +20,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer @@ -40,7 +40,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[TMP24]], <64 x i8> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -68,10 +68,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> ; CHECK-NEXT: [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer @@ -88,7 +88,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[TMP30]], <64 x i8> [[TMP31]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -97,10 +97,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8> ; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> ; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8> ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer @@ -117,7 +117,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[TMP39]], <64 x i8> [[TMP40]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer @@ -151,10 +151,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> % ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer @@ -171,7 +171,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> % ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[TMP24]], <64 x i8> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -199,10 +199,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> ; CHECK-NEXT: [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer @@ -219,7 +219,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[TMP30]], <64 x i8> [[TMP31]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -228,10 +228,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8> ; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> ; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8> ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer @@ -248,7 +248,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[TMP39]], <64 x i8> [[TMP40]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll index 7c39ff6bb2be1..1b8c43f699e1f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll @@ -10,26 +10,22 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512( -; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <64 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <64 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <64 x i1> [[TMP16]] to <64 x i8> @@ -39,22 +35,22 @@ define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i3 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[X2]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; - %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512( -; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <64 x i8> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] @@ -62,22 +58,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: [[BB7]]: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP31:%.*]] = load <64 x i8>, ptr [[X2P]], align 64 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = load <64 x i8>, ptr [[TMP10]], align 64 ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <64 x i8> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP38:%.*]] = and <64 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP58:%.*]] = and <64 x i1> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[TMP59:%.*]] = and <64 x i1> [[TMP34]], [[TMP37]] +; CHECK-NEXT: [[TMP38:%.*]] = and <64 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP58:%.*]] = and <64 x i1> [[TMP32]], [[TMP36]] +; CHECK-NEXT: [[TMP59:%.*]] = and <64 x i1> [[TMP35]], [[TMP37]] ; CHECK-NEXT: [[TMP60:%.*]] = or <64 x i1> [[TMP38]], [[TMP58]] ; CHECK-NEXT: [[TMP61:%.*]] = or <64 x i1> [[TMP60]], [[TMP59]] ; CHECK-NEXT: [[TMP62:%.*]] = sext <64 x i1> [[TMP61]] to <64 x i8> @@ -87,7 +79,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[TMP31]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -96,17 +88,13 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8> -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8> -; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <64 x i8> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = and <64 x i1> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP48:%.*]] = and <64 x i1> [[TMP45]], [[TMP44]] -; CHECK-NEXT: [[TMP49:%.*]] = and <64 x i1> [[TMP43]], [[TMP46]] +; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <64 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = and <64 x i1> [[TMP39]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = and <64 x i1> [[TMP41]], [[TMP46]] +; CHECK-NEXT: [[TMP49:%.*]] = and <64 x i1> [[TMP39]], [[TMP42]] ; CHECK-NEXT: [[TMP50:%.*]] = or <64 x i1> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP51:%.*]] = or <64 x i1> [[TMP50]], [[TMP49]] ; CHECK-NEXT: [[TMP52:%.*]] = sext <64 x i1> [[TMP51]] to <64 x i8> @@ -116,7 +104,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[X4]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer @@ -132,11 +120,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES2]] ; - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <64 x i8>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 @@ -144,26 +132,22 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i ret { <16 x i32>, <16 x i32> } %res2 } -declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { +define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusds_512( -; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <64 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <64 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <64 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <64 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <64 x i1> [[TMP16]] to <64 x i8> @@ -173,22 +157,22 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> % ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x i16> [[TMP20]] to i512 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i512 [[TMP21]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[X2]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; - %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512( -; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <64 x i8> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -196,22 +180,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB7]]: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P]], align 64 +; CHECK-NEXT: [[TMP31:%.*]] = load <64 x i8>, ptr [[X2P]], align 64 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[X2]] to <64 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <64 x i8> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = load <64 x i8>, ptr [[TMP10]], align 64 ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <64 x i8> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <64 x i8> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP38:%.*]] = and <64 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP58:%.*]] = and <64 x i1> [[TMP36]], [[TMP35]] -; CHECK-NEXT: [[TMP59:%.*]] = and <64 x i1> [[TMP34]], [[TMP37]] +; CHECK-NEXT: [[TMP38:%.*]] = and <64 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP58:%.*]] = and <64 x i1> [[TMP32]], [[TMP36]] +; CHECK-NEXT: [[TMP59:%.*]] = and <64 x i1> [[TMP35]], [[TMP37]] ; CHECK-NEXT: [[TMP60:%.*]] = or <64 x i1> [[TMP38]], [[TMP58]] ; CHECK-NEXT: [[TMP61:%.*]] = or <64 x i1> [[TMP60]], [[TMP59]] ; CHECK-NEXT: [[TMP62:%.*]] = sext <64 x i1> [[TMP61]] to <64 x i8> @@ -221,7 +201,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; CHECK-NEXT: [[TMP66:%.*]] = bitcast <32 x i16> [[TMP65]] to i512 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i512 [[TMP66]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP29]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[TMP31]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -230,17 +210,13 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[X1]] to <64 x i8> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[X4]] to <64 x i8> -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i32> [[TMP5]] to <64 x i8> -; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <64 x i8> [[TMP41]], zeroinitializer -; CHECK-NEXT: [[TMP44:%.*]] = icmp ne <64 x i8> [[TMP42]], zeroinitializer -; CHECK-NEXT: [[TMP45:%.*]] = icmp ne <64 x i8> [[TMP39]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = icmp ne <64 x i8> [[TMP33]], zeroinitializer ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <64 x i8> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP47:%.*]] = and <64 x i1> [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP48:%.*]] = and <64 x i1> [[TMP45]], [[TMP44]] -; CHECK-NEXT: [[TMP49:%.*]] = and <64 x i1> [[TMP43]], [[TMP46]] +; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <64 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <64 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = and <64 x i1> [[TMP39]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = and <64 x i1> [[TMP41]], [[TMP46]] +; CHECK-NEXT: [[TMP49:%.*]] = and <64 x i1> [[TMP39]], [[TMP42]] ; CHECK-NEXT: [[TMP50:%.*]] = or <64 x i1> [[TMP47]], [[TMP48]] ; CHECK-NEXT: [[TMP51:%.*]] = or <64 x i1> [[TMP50]], [[TMP49]] ; CHECK-NEXT: [[TMP52:%.*]] = sext <64 x i1> [[TMP51]] to <64 x i8> @@ -250,7 +226,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; CHECK-NEXT: [[TMP56:%.*]] = bitcast <32 x i16> [[TMP55]] to i512 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i512 [[TMP56]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP57]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> [[X0]], <64 x i8> [[X1]], <64 x i8> [[X4]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer @@ -266,11 +242,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ; CHECK-NEXT: store { <16 x i32>, <16 x i32> } [[TMP28]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret { <16 x i32>, <16 x i32> } [[RES2]] ; - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <64 x i8>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll index 678faef203324..3087bc4e5ad8c 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll @@ -10,26 +10,22 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusd_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8> @@ -39,34 +35,30 @@ define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusd_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8> @@ -76,34 +68,30 @@ define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { +define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusds_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <32 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP16]] to <32 x i8> @@ -113,34 +101,30 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x i16> [[TMP20]] to i256 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i256 [[TMP21]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { +define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusds_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i8> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP7]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i1> [[TMP12]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i8> @@ -150,11 +134,11 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP20]] to i128 ; CHECK-NEXT: [[TMP22:%.*]] = bitcast i128 [[TMP21]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) ret <4 x i32> %res }