diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d40475cd92b841..dd859c9269cfdc 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -648,6 +648,10 @@ X86 Support in Clang
 - Support ISA of ``AVX-IFMA``.
   * Support intrinsic of ``_mm(256)_madd52hi_avx_epu64``.
   * Support intrinsic of ``_mm(256)_madd52lo_avx_epu64``.
+- Support ISA of ``AVX-VNNI-INT8``.
+  * Support intrinsic of ``_mm(256)_dpbssd(s)_epi32``.
+  * Support intrinsic of ``_mm(256)_dpbsud(s)_epi32``.
+  * Support intrinsic of ``_mm(256)_dpbuud(s)_epi32``.
 
 WebAssembly Support in Clang
 ----------------------------
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index d7ed93885020fe..3fc77c95b76648 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -906,6 +906,7 @@ TARGET_BUILTIN(__builtin_ia32_alignq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl
 TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f")
 
+// AVX-VNNI and AVX512-VNNI
 TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
 TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
 TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
@@ -919,6 +920,20 @@ TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512v
 TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
 TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
 
+// AVX-VNNI-INT8
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
+TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
+
 TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_gather3div4df, "V4dV4dvC*V4OiUcIi", "nV:256:", "avx512vl")
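For readers decoding the `.def` entries above: the signature string `V4iV4iV4iV4i` reads left to right as the return type followed by the three operand types (each a vector of 4 `int`), and `ncV:128:` marks the builtin nothrow and const with a minimum vector width of 128 bits. A minimal sketch of how the new header maps onto one of these builtins; the wrapper name and file name are illustrative, not part of the patch:

```c
#include <immintrin.h>

/* Compile with: clang -mavxvnniint8 -c wrap.c */
__m128i wrap_vpdpbssd(__m128i w, __m128i a, __m128i b) {
  /* _mm_dpbssd_epi32 expands one-to-one to __builtin_ia32_vpdpbssd128,
   * whose "V4iV4iV4iV4i" prototype is declared by the entry above. */
  return _mm_dpbssd_epi32(w, a, b);
}
```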
Flag<["-"], "mavxvnniint8">, Group; +def mno_avxvnniint8 : Flag<["-"], "mno-avxvnniint8">, Group; def mavxvnni : Flag<["-"], "mavxvnni">, Group; def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group; def madx : Flag<["-"], "madx">, Group; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 78c032e2d7d4b1..442503a13025c5 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -342,6 +342,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAVXIFMA = true; } else if (Feature == "+avxvnni") { HasAVXVNNI = true; + } else if (Feature == "+avxvnniint8") { + HasAVXVNNIINT8 = true; } else if (Feature == "+serialize") { HasSERIALIZE = true; } else if (Feature == "+tsxldtrk") { @@ -796,6 +798,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AVXIFMA__"); if (HasAVXVNNI) Builder.defineMacro("__AVXVNNI__"); + if (HasAVXVNNIINT8) + Builder.defineMacro("__AVXVNNIINT8__"); if (HasSERIALIZE) Builder.defineMacro("__SERIALIZE__"); if (HasTSXLDTRK) @@ -920,6 +924,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("avx512vp2intersect", true) .Case("avxifma", true) .Case("avxvnni", true) + .Case("avxvnniint8", true) .Case("bmi", true) .Case("bmi2", true) .Case("cldemote", true) @@ -1019,6 +1024,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("avx512vp2intersect", HasAVX512VP2INTERSECT) .Case("avxifma", HasAVXIFMA) .Case("avxvnni", HasAVXVNNI) + .Case("avxvnni", HasAVXVNNI) + .Case("avxvnniint8", HasAVXVNNIINT8) .Case("bmi", HasBMI) .Case("bmi2", HasBMI2) .Case("cldemote", HasCLDEMOTE) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 7b67a4060ec3be..825087838941f9 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -141,6 +141,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXFP16 = false; bool HasCMPCCXADD = false; bool HasRAOINT = false; + bool HasAVXVNNIINT8 = false; bool HasKL = false; // For key locker bool HasWIDEKL = false; // For wide key locker bool HasHRESET = false; diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 5a7f81b4ed07d4..fdf30240457793 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -144,6 +144,7 @@ set(x86_files avx512vpopcntdqvlintrin.h avxifmaintrin.h avxintrin.h + avxvnniint8intrin.h avxvnniintrin.h bmi2intrin.h bmiintrin.h diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h new file mode 100644 index 00000000000000..b0b6cb853f713f --- /dev/null +++ b/clang/lib/Headers/avxvnniint8intrin.h @@ -0,0 +1,471 @@ +/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifndef __AVXVNNIINT8INTRIN_H +#define __AVXVNNIINT8INTRIN_H + +/* Define the default attributes for the functions in this file. 
diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h
new file mode 100644
index 00000000000000..b0b6cb853f713f
--- /dev/null
+++ b/clang/lib/Headers/avxvnniint8intrin.h
@@ -0,0 +1,495 @@
+/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+    "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXVNNIINT8INTRIN_H
+#define __AVXVNNIINT8INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),   \
+                 __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),   \
+                 __min_vector_width__(128)))
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [16 x char].
+/// \param __B
+///    A 128-bit vector of [16 x char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+///      tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+///      tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+///      tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+///      tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+///      dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
+                                                                 __m128i __A,
+                                                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [32 x char].
+/// \param __B
+///    A 256-bit vector of [32 x char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+///      tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+///      tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+///      tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+///      tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+///      dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
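A scalar model of one 32-bit lane of `VPDPBSSD`, matching the pseudocode above. The helper name is illustrative; the unsigned accumulator mirrors the instruction's wraparound behavior while staying well-defined in C:

```c
#include <stdint.h>

/* One dword lane of VPDPBSSD: four signed-byte products, accumulated. */
static int32_t dpbssd_lane(int32_t w, const int8_t a[4], const int8_t b[4]) {
  uint32_t acc = (uint32_t)w;
  for (int k = 0; k < 4; ++k)
    acc += (uint32_t)((int16_t)a[k] * (int16_t)b[k]); /* SignExtend16 * SignExtend16 */
  return (int32_t)acc; /* wraps on overflow, like the non-saturating form */
}
```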
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [16 x char].
+/// \param __B
+///    A 128-bit vector of [16 x char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+///      tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+///      tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+///      tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+///      tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+///      dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
+                                                                  __m128i __A,
+                                                                  __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [32 x char].
+/// \param __B
+///    A 256-bit vector of [32 x char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+///      tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+///      tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+///      tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+///      tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+///      dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
+                                              (__v8si)__B);
+}
+
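The `s` suffix changes only the final accumulate: the sum is clamped to the `int32` range instead of wrapping. A scalar model of `SIGNED_DWORD_SATURATE` from the pseudocode (helper name illustrative):

```c
#include <stdint.h>

static int32_t dpbssds_lane(int32_t w, const int8_t a[4], const int8_t b[4]) {
  int64_t acc = w; /* wide enough: |sum of products| <= 4 * 128 * 128 */
  for (int k = 0; k < 4; ++k)
    acc += (int16_t)a[k] * (int16_t)b[k];
  if (acc > INT32_MAX) return INT32_MAX;
  if (acc < INT32_MIN) return INT32_MIN;
  return (int32_t)acc;
}
```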
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [16 x char].
+/// \param __B
+///    A 128-bit vector of [16 x unsigned char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+///      tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+///      dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
+                                                                 __m128i __A,
+                                                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [32 x char].
+/// \param __B
+///    A 256-bit vector of [32 x unsigned char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+///      tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+///      dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
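In the `sud` forms the two sources are extended differently before the multiply: `__A` is sign-extended and `__B` zero-extended. A scalar model of one lane (names illustrative):

```c
#include <stdint.h>

static int32_t dpbsud_lane(int32_t w, const int8_t a[4], const uint8_t b[4]) {
  uint32_t acc = (uint32_t)w;
  for (int k = 0; k < 4; ++k)
    acc += (uint32_t)((int32_t)a[k] * (int32_t)b[k]); /* sign- x zero-extend */
  return (int32_t)acc;
}
```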
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [16 x char].
+/// \param __B
+///    A 128-bit vector of [16 x unsigned char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+///      tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+///      dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
+                                                                  __m128i __A,
+                                                                  __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [32 x char].
+/// \param __B
+///    A 256-bit vector of [32 x unsigned char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+///      tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+///      dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
+                                              (__v8si)__B);
+}
+
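The mixed-sign forms map naturally onto quantized dot products where one side holds signed weights and the other unsigned activations. A typical accumulation loop, 32 bytes per iteration (array layout and names are illustrative; assumes `n` is a multiple of 32):

```c
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Every one of the 8 int32 lanes accumulates 4 byte-pair products per step. */
__m256i dot_i8u8(const int8_t *w, const uint8_t *x, size_t n) {
  __m256i acc = _mm256_setzero_si256();
  for (size_t i = 0; i < n; i += 32)
    acc = _mm256_dpbsud_epi32(acc,
                              _mm256_loadu_si256((const __m256i *)(w + i)),
                              _mm256_loadu_si256((const __m256i *)(x + i)));
  return acc; /* caller reduces the 8 lanes to a scalar */
}
```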
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [16 x unsigned char].
+/// \param __B
+///    A 128-bit vector of [16 x unsigned char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+///      tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+///      tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+///      tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+///      tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+///      dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
+                                                                 __m128i __A,
+                                                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
+                                             (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [32 x unsigned char].
+/// \param __B
+///    A 256-bit vector of [32 x unsigned char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+///      tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+///      tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+///      tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+///      tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+///      dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
+                                             (__v8si)__B);
+}
+
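The `uud` forms zero-extend both sources, so every product is non-negative. A scalar model of one lane (names illustrative):

```c
#include <stdint.h>

static int32_t dpbuud_lane(int32_t w, const uint8_t a[4], const uint8_t b[4]) {
  uint32_t acc = (uint32_t)w;
  for (int k = 0; k < 4; ++k)
    acc += (uint32_t)a[k] * (uint32_t)b[k]; /* ZeroExtend16 * ZeroExtend16 */
  return (int32_t)acc;
}
```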
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with unsigned saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
+///
+/// \param __W
+///    A 128-bit vector of [4 x int].
+/// \param __A
+///    A 128-bit vector of [16 x unsigned char].
+/// \param __B
+///    A 128-bit vector of [16 x unsigned char].
+/// \returns
+///    A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+///      tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+///      tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+///      tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+///      tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+///      dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
+                                                                  __m128i __A,
+                                                                  __m128i __B) {
+  return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
+                                              (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with unsigned saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
+///
+/// \param __W
+///    A 256-bit vector of [8 x int].
+/// \param __A
+///    A 256-bit vector of [32 x unsigned char].
+/// \param __B
+///    A 256-bit vector of [32 x unsigned char].
+/// \returns
+///    A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+///      tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+///      tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+///      tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+///      tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+///      dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
+                                              (__v8si)__B);
+}
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINT8INTRIN_H
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 2e674ef6d96c33..f5884c23eedcfc 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -209,6 +209,7 @@
 #define bit_AVXIFMA       0x00800000
 
 /* Features in %edx for leaf 7 sub-leaf 1 */
+#define bit_AVXVNNIINT8   0x00000010
 #define bit_PREFETCHI     0x00004000
 
 /* Features in %eax for leaf 13 sub-leaf 1 */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 00ee91b364aeb4..1204dc700c63a4 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -259,6 +259,11 @@
 #include <avxifmaintrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVXVNNIINT8__)
+#include <avxvnniint8intrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__RDPID__)
 /// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
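With the header and the `immintrin.h` dispatch in place, an end-to-end smoke test looks like this (values and file name illustrative):

```c
/* demo.c - build with: clang -mavxvnniint8 demo.c */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i w = _mm_set1_epi32(10);
  __m128i a = _mm_set1_epi8(2);          /* 16 signed bytes */
  __m128i b = _mm_set1_epi8(3);          /* 16 signed bytes */
  __m128i r = _mm_dpbssd_epi32(w, a, b); /* each lane: 10 + 4*(2*3) = 34 */
  printf("%d\n", _mm_extract_epi32(r, 0)); /* prints 34 */
  return 0;
}
```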
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index 604e3152debada..9a8a6643a6c60b 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -54,9 +54,9 @@ void __attribute__((target("arch=x86-64-v4"))) x86_64_v4(void) {}
 // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686"
 // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
 // CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
 // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxvnni,-avxvnniint8,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
 // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
 // CHECK-NOT: tune-cpu
 // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx"
diff --git a/clang/test/CodeGen/avxvnniint8-builtins.c b/clang/test/CodeGen/avxvnniint8-builtins.c
new file mode 100644
index 00000000000000..cbdf443888a15a
--- /dev/null
+++ b/clang/test/CodeGen/avxvnniint8-builtins.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding %s -triple=i386- -target-feature +avxvnniint8 -emit-llvm -o - -Wall -Werror | FileCheck %s
+
+#include <immintrin.h>
+
+// CHECK-LABEL: @test_mm_dpbssd_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssd.128
+__m128i test_mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B) {
+  return _mm_dpbssd_epi32(__W, __A, __B);
+}
+
+// CHECK-LABEL: @test_mm_dpbssds_epi32(
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssds.128
+__m128i
test_mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbssds_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm_dpbsud_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsud.128 +__m128i test_mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbsud_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm_dpbsuds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128 +__m128i test_mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbsuds_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm_dpbuud_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuud.128 +__m128i test_mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbuud_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm_dpbuuds_epi32( +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128 +__m128i test_mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B) { + return _mm_dpbuuds_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbssd_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssd.256 +__m256i test_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbssd_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbssds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssds.256 +__m256i test_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbssds_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbsud_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsud.256 +__m256i test_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbsud_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbsuds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256 +__m256i test_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbsuds_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbuud_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuud.256 +__m256i test_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbuud_epi32(__W, __A, __B); +} + +// CHECK-LABEL: @test_mm256_dpbuuds_epi32( +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256 +__m256i test_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { + return _mm256_dpbuuds_epi32(__W, __A, __B); +} diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 0954ad135b328d..2c69d7903b0123 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -332,6 +332,11 @@ // AVXIFMA: "-target-feature" "+avxifma" // NO-AVXIFMA: "-target-feature" "-avxifma" +// RUN: %clang --target=i386 -mavxvnniint8 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX-VNNIINT8 %s +// RUN: %clang --target=i386 -mno-avxvnniint8 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX-VNNIINT8 %s +// AVX-VNNIINT8: "-target-feature" "+avxvnniint8" +// NO-AVX-VNNIINT8: "-target-feature" "-avxvnniint8" + // RUN: %clang --target=i386 -march=i386 -mcrc32 %s -### 2>&1 | FileCheck -check-prefix=CRC32 %s // RUN: %clang --target=i386 -march=i386 -mno-crc32 %s -### 2>&1 | FileCheck -check-prefix=NO-CRC32 %s // CRC32: "-target-feature" "+crc32" diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 2db998f4ee822e..46e76a3517afdc 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -620,6 +620,20 @@ // NO-RAOINT-NOT: #define __RAOINT__ 1 +// RUN: %clang -target 
i386-unknown-unknown -march=atom -mavxvnniint8 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNIINT8 %s + +// AVXVNNIINT8: #define __AVX2__ 1 +// AVXVNNIINT8: #define __AVXVNNIINT8__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mno-avxvnniint8 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOAVXVNNIINT8 %s + +// NOAVXVNNIINT8-NOT: #define __AVXVNNIINT8__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnniint8 -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNIINT8NOAVX2 %s + +// AVXVNNIINT8NOAVX2-NOT: #define __AVX2__ 1 +// AVXVNNIINT8NOAVX2-NOT: #define __AVXVNNIINT8__ 1 + // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mcrc32 -x c -E -dM -o - %s | FileCheck -check-prefix=CRC32 %s // CRC32: #define __CRC32__ 1 diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index d892d92297e4b3..556f7fc7f5449e 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -141,6 +141,7 @@ Changes to the X86 Backend * Add support for the ``WRMSRNS`` instruction. * Support ISA of ``AMX-FP16`` which contains ``tdpfp16ps`` instruction. * Support ISA of ``CMPCCXADD``. +* Support ISA of ``AVX-VNNI-INT8``. Changes to the OCaml bindings ----------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index d45be12cda8770..bc0e4106a410b0 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -1929,6 +1929,66 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". ClangBuiltin<"__builtin_ia32_vpdpwssds512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; + def int_x86_avx2_vpdpbssd_128 + : ClangBuiltin<"__builtin_ia32_vpdpbssd128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbssd_256 + : ClangBuiltin<"__builtin_ia32_vpdpbssd256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbssds_128 + : ClangBuiltin<"__builtin_ia32_vpdpbssds128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbssds_256 + : ClangBuiltin<"__builtin_ia32_vpdpbssds256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbsud_128 + : ClangBuiltin<"__builtin_ia32_vpdpbsud128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbsud_256 + : ClangBuiltin<"__builtin_ia32_vpdpbsud256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbsuds_128 + : ClangBuiltin<"__builtin_ia32_vpdpbsuds128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbsuds_256 + : ClangBuiltin<"__builtin_ia32_vpdpbsuds256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbuud_128 + : ClangBuiltin<"__builtin_ia32_vpdpbuud128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbuud_256 + : ClangBuiltin<"__builtin_ia32_vpdpbuud256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbuuds_128 + : 
ClangBuiltin<"__builtin_ia32_vpdpbuuds128">, + Intrinsic<[llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_vpdpbuuds_256 + : ClangBuiltin<"__builtin_ia32_vpdpbuuds256">, + Intrinsic<[llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [IntrNoMem]>; } //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def index e8e50acf1771b9..2c656e19d0d198 100644 --- a/llvm/include/llvm/Support/X86TargetParser.def +++ b/llvm/include/llvm/Support/X86TargetParser.def @@ -207,6 +207,7 @@ X86_FEATURE (AMX_FP16, "amx-fp16") X86_FEATURE (CMPCCXADD, "cmpccxadd") X86_FEATURE (AVXVNNI, "avxvnni") X86_FEATURE (AVXIFMA, "avxifma") +X86_FEATURE (AVXVNNIINT8, "avxvnniint8") // These features aren't really CPU features, but the frontend can set them. X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 96663c8d40e43a..d1f01fce62a158 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1812,6 +1812,7 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["cmpccxadd"] = HasLeaf7Subleaf1 && ((EAX >> 7) & 1); Features["hreset"] = HasLeaf7Subleaf1 && ((EAX >> 22) & 1); Features["avxifma"] = HasLeaf7Subleaf1 && ((EAX >> 23) & 1) && HasAVXSave; + Features["avxvnniint8"] = HasLeaf7Subleaf1 && ((EDX >> 4) & 1) && HasAVXSave; Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1); bool HasLeafD = MaxLevel >= 0xd && diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp index 2b0a3bb881201c..e6074b1e904ec1 100644 --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -582,6 +582,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; +constexpr FeatureBitset ImpliedFeaturesAVXVNNIINT8 = FeatureAVX2; constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; constexpr FeatureBitset ImpliedFeaturesCMPCCXADD = {}; constexpr FeatureBitset ImpliedFeaturesRAOINT = {}; diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 0f0cf69ed012e5..a860352acad625 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -187,6 +187,10 @@ def FeatureVP2INTERSECT : SubtargetFeature<"avx512vp2intersect", def FeatureFP16 : SubtargetFeature<"avx512fp16", "HasFP16", "true", "Support 16-bit floating point", [FeatureBWI, FeatureVLX, FeatureDQI]>; +def FeatureAVXVNNIINT8 : SubtargetFeature<"avxvnniint8", + "HasAVXVNNIINT8", "true", + "Enable AVX-VNNI-INT8", + [FeatureAVX2]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 181171272cce3a..9fd07a7301bb37 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34221,6 +34221,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ENQCMD) NODE_NAME_CASE(ENQCMDS) NODE_NAME_CASE(VP2INTERSECT) + NODE_NAME_CASE(VPDPBSUD) + NODE_NAME_CASE(VPDPBSUDS) + NODE_NAME_CASE(VPDPBUUD) + NODE_NAME_CASE(VPDPBUUDS) + 
NODE_NAME_CASE(VPDPBSSD) + NODE_NAME_CASE(VPDPBSSDS) NODE_NAME_CASE(AESENC128KL) NODE_NAME_CASE(AESDEC128KL) NODE_NAME_CASE(AESENC256KL) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 50d9bbf613ff56..ba5f31e03777ad 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -584,6 +584,13 @@ namespace llvm { VFCMULCSH, VFCMULCSH_RND, + VPDPBSUD, + VPDPBSUDS, + VPDPBUUD, + VPDPBUUDS, + VPDPBSSD, + VPDPBSSDS, + // Compress and expand. COMPRESS, EXPAND, diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 9d58889814037c..f4e24329ff014e 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -3995,6 +3995,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 }, { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 }, { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 }, + { X86::VPDPBSSDSYrr, X86::VPDPBSSDSYrm, 0 }, + { X86::VPDPBSSDSrr, X86::VPDPBSSDSrm, 0 }, + { X86::VPDPBSSDYrr, X86::VPDPBSSDYrm, 0 }, + { X86::VPDPBSSDrr, X86::VPDPBSSDrm, 0 }, + { X86::VPDPBSUDSYrr, X86::VPDPBSUDSYrm, 0 }, + { X86::VPDPBSUDSrr, X86::VPDPBSUDSrm, 0 }, + { X86::VPDPBSUDYrr, X86::VPDPBSUDYrm, 0 }, + { X86::VPDPBSUDrr, X86::VPDPBSUDrm, 0 }, { X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 }, { X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 }, { X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 }, @@ -4005,6 +4013,10 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 }, { X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 }, { X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 }, + { X86::VPDPBUUDSYrr, X86::VPDPBUUDSYrm, 0 }, + { X86::VPDPBUUDSrr, X86::VPDPBUUDSrm, 0 }, + { X86::VPDPBUUDYrr, X86::VPDPBUUDYrm, 0 }, + { X86::VPDPBUUDrr, X86::VPDPBUUDrm, 0 }, { X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 }, { X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 }, { X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 65c1adc38d7dcb..774f7e92ebb31f 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -813,6 +813,13 @@ def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2> ]>; +def X86vpdpbssd : SDNode<"X86ISD::VPDPBSSD", SDTVnni>; +def X86vpdpbssds : SDNode<"X86ISD::VPDPBSSDS", SDTVnni>; +def X86vpdpbsud : SDNode<"X86ISD::VPDPBSUD", SDTVnni>; +def X86vpdpbsuds : SDNode<"X86ISD::VPDPBSUDS", SDTVnni>; +def X86vpdpbuud : SDNode<"X86ISD::VPDPBUUD", SDTVnni>; +def X86vpdpbuuds : SDNode<"X86ISD::VPDPBUUDS", SDTVnni>; + //===----------------------------------------------------------------------===// // SSE pattern fragments //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 4f17635b42df17..4668a35c76cb33 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2555,6 +2555,14 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, case X86::VPDPWSSDrr: case X86::VPDPWSSDSYrr: case X86::VPDPWSSDSrr: + case X86::VPDPBSSDSrr: + case X86::VPDPBSSDSYrr: + case X86::VPDPBSSDrr: + case X86::VPDPBSSDYrr: + case X86::VPDPBUUDSrr: + case X86::VPDPBUUDSYrr: + case 
X86::VPDPBUUDrr:
+  case X86::VPDPBUUDYrr:
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128rk:
   case X86::VPDPWSSDZ128rkz:
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 8e038f4c00a304..9ce2afec869b41 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -921,6 +921,7 @@ def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
 def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
 def HasBF16 : Predicate<"Subtarget->hasBF16()">;
 def HasFP16 : Predicate<"Subtarget->hasFP16()">;
+def HasAVXVNNIINT8 : Predicate<"Subtarget->hasAVXVNNIINT8()">;
 def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
 def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
 
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index a34eeb60f7ed57..99d575b570bb13 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8160,3 +8160,62 @@ multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
 
 defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, VEX_W, ExplicitVEXPrefix;
 defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, VEX_W, ExplicitVEXPrefix;
+
+let Constraints = "$src1 = $dst" in
+multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
+                          RegisterClass RC, PatFrag MemOpFrag,
+                          X86MemOperand X86memop, SDNode OpNode,
+                          X86FoldableSchedWrite Sched,
+                          bit IsCommutable> {
+  let isCommutable = IsCommutable in
+  def rr  :  I<Opc, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2, RC:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
+             VEX_4V, Sched<[Sched]>;
+  def rm  :  I<Opc, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2, X86memop:$src3),
+             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
+                                  (MemOpFrag addr:$src3))))]>,
+             VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
+}
+
+let Predicates = [HasAVXVNNIINT8] in {
+  defm VPDPBSSD   : avx_dotprod_rm<0x50,"vpdpbssd", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
+                                   1>, T8XD;
+  defm VPDPBSSDY  : avx_dotprod_rm<0x50,"vpdpbssd", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
+                                   1>, VEX_L, T8XD;
+  defm VPDPBUUD   : avx_dotprod_rm<0x50,"vpdpbuud", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
+                                   1>, T8PS;
+  defm VPDPBUUDY  : avx_dotprod_rm<0x50,"vpdpbuud", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
+                                   1>, VEX_L, T8PS;
+  defm VPDPBSSDS  : avx_dotprod_rm<0x51,"vpdpbssds", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
+                                   1>, T8XD;
+  defm VPDPBSSDSY : avx_dotprod_rm<0x51,"vpdpbssds", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
+                                   1>, VEX_L, T8XD;
+  defm VPDPBUUDS  : avx_dotprod_rm<0x51,"vpdpbuuds", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
+                                   1>, T8PS;
+  defm VPDPBUUDSY : avx_dotprod_rm<0x51,"vpdpbuuds", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
+                                   1>, VEX_L, T8PS;
+  defm VPDPBSUD   : avx_dotprod_rm<0x50,"vpdpbsud", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM,
+                                   0>, T8XS;
+  defm VPDPBSUDY  : avx_dotprod_rm<0x50,"vpdpbsud", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM,
+                                   0>, VEX_L, T8XS;
+  defm VPDPBSUDS  : avx_dotprod_rm<0x51,"vpdpbsuds", v4i32, VR128, loadv4i32,
+                                   i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
+                                   0>, T8XS;
+  defm VPDPBSUDSY : avx_dotprod_rm<0x51,"vpdpbsuds", v8i32, VR256, loadv8i32,
+                                   i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
+                                   0>, VEX_L, T8XS;
+}
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 6112c0b7d6c3e0..3bb2f07b5f1a13 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ 
b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -415,6 +415,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, X86ISD::VSRLV, 0), X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx2_vpdpbssd_128, INTR_TYPE_3OP, X86ISD::VPDPBSSD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbssd_256, INTR_TYPE_3OP, X86ISD::VPDPBSSD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbssds_128, INTR_TYPE_3OP, X86ISD::VPDPBSSDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbssds_256, INTR_TYPE_3OP, X86ISD::VPDPBSSDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbsud_128, INTR_TYPE_3OP, X86ISD::VPDPBSUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbsud_256, INTR_TYPE_3OP, X86ISD::VPDPBSUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbsuds_128, INTR_TYPE_3OP, X86ISD::VPDPBSUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbsuds_256, INTR_TYPE_3OP, X86ISD::VPDPBSUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbuud_128, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbuud_256, INTR_TYPE_3OP, X86ISD::VPDPBUUD, 0), + X86_INTRINSIC_DATA(avx2_vpdpbuuds_128, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0), + X86_INTRINSIC_DATA(avx2_vpdpbuuds_256, INTR_TYPE_3OP, X86ISD::VPDPBUUDS, 0), X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_conflict_d_128, INTR_TYPE_1OP, X86ISD::CONFLICT, 0), diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll new file mode 100644 index 00000000000000..5c17079519116a --- /dev/null +++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll @@ -0,0 +1,316 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X64 + + +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbssd_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbssd (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x18] +; X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssd_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbssd (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x50,0x1f] +; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, <4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define 
<4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbssds_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbssds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0x18] +; X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssds_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbssds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x73,0x51,0x1f] +; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, <4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbssd_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X86-NEXT: vpdpbssd (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0x18] +; X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2] +; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssd_256: +; X64: # %bb.0: +; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X64-NEXT: vpdpbssd (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x50,0x1f] +; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2] +; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <8 x i32>, <8 x i32>* %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbssds_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X86-NEXT: vpdpbssds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x18] +; X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2] +; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssds_256: +; X64: # %bb.0: +; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8] +; X64-NEXT: vpdpbssds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x77,0x51,0x1f] 
+; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2] +; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <8 x i32>, <8 x i32>* %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %res = add <8 x i32> %1, %2 + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbsud_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbsud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x18] +; X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsud_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbsud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x50,0x1f] +; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, <4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X86-NEXT: vpdpbsuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0x18] +; X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2] +; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; X64: # %bb.0: +; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8] +; X64-NEXT: vpdpbsuds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x72,0x51,0x1f] +; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2] +; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %x2 = load <4 x i32>, <4 x i32>* %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %res = add <4 x i32> %1, %2 + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) { +; X86-LABEL: test_int_x86_avx2_vpdpbsud_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: 
[0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbsud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0x18]
+; X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbsud_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbsud (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x50,0x1f]
+; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbsuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x18]
+; X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbsuds_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbsuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x76,0x51,0x1f]
+; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT: vpdpbuud (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x18]
+; X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
+; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuud_128:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT: vpdpbuud (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x50,0x1f]
+; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2]
+; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %res = add <4 x i32> %1, %2
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuuds_128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X86-NEXT: vpdpbuuds (%eax), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0x18]
+; X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2]
+; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuuds_128:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm3 # encoding: [0xc5,0xf8,0x28,0xd8]
+; X64-NEXT: vpdpbuuds (%rdi), %xmm1, %xmm3 # encoding: [0xc4,0xe2,0x70,0x51,0x1f]
+; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2]
+; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %res = add <4 x i32> %1, %2
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuud_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbuud (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0x18]
+; X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuud_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbuud (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x50,0x1f]
+; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4) {
+; X86-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X86-NEXT: vpdpbuuds (%eax), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x18]
+; X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
+; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx2_vpdpbuuds_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm3 # encoding: [0xc5,0xfc,0x28,0xd8]
+; X64-NEXT: vpdpbuuds (%rdi), %ymm1, %ymm3 # encoding: [0xc4,0xe2,0x74,0x51,0x1f]
+; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2]
+; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # encoding: [0xc5,0xe5,0xfe,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %res = add <8 x i32> %1, %2
+ ret <8 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll
new file mode 100644
index 00000000000000..fd988f7d318fec
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll
@@ -0,0 +1,355 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 < %s | FileCheck %s
+
+declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssd_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssd_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssds_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbssds_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsud_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; CHECK-NEXT: vpdpbsud %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsud_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT: vpdpbsud %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsuds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsuds_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; CHECK-NEXT: vpdpbsuds %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbsuds_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; CHECK-NEXT: vpdpbsuds %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuud_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuud_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuuds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuuds_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: stack_fold_vpdpbuuds_256_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt
new file mode 100644
index 00000000000000..63218ef55e57f0
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-32.txt
@@ -0,0 +1,243 @@
+# RUN: llvm-mc --disassemble %s -triple=i686 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: vpdpbssd %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x50,0xd4
+
+# ATT: vpdpbssd %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x50,0xd4
+
+# ATT: vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%eax), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x67,0x50,0x10
+
+# ATT: vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssd (%eax), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x63,0x50,0x10
+
+# ATT: vpdpbssd -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbssds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymm4
+0xc4,0xe2,0x67,0x51,0xd4
+
+# ATT: vpdpbssds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmm4
+0xc4,0xe2,0x63,0x51,0xd4
+
+# ATT: vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%eax), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x67,0x51,0x10
+
+# ATT: vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbssds (%eax), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x63,0x51,0x10
+
+# ATT: vpdpbssds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x50,0xd4
+
+# ATT: vpdpbsud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x50,0xd4
+
+# ATT: vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%eax), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0x50,0x10
+
+# ATT: vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsud (%eax), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0x50,0x10
+
+# ATT: vpdpbsud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbsuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x66,0x51,0xd4
+
+# ATT: vpdpbsuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x62,0x51,0xd4
+
+# ATT: vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%eax), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x66,0x51,0x10
+
+# ATT: vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbsuds (%eax), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x62,0x51,0x10
+
+# ATT: vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuud %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x50,0xd4
+
+# ATT: vpdpbuud %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x50,0xd4
+
+# ATT: vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%eax), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0x50,0x10
+
+# ATT: vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuud (%eax), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0x50,0x10
+
+# ATT: vpdpbuud -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff
+
+# ATT: vpdpbuuds %ymm4, %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymm4
+0xc4,0xe2,0x64,0x51,0xd4
+
+# ATT: vpdpbuuds %xmm4, %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmm4
+0xc4,0xe2,0x60,0x51,0xd4
+
+# ATT: vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%eax), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [eax]
+0xc4,0xe2,0x64,0x51,0x10
+
+# ATT: vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2
+# INTEL: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff
+
+# ATT: vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10
+
+# ATT: vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00
+
+# ATT: vpdpbuuds (%eax), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [eax]
+0xc4,0xe2,0x60,0x51,0x10
+
+# ATT: vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2
+# INTEL: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff
+
diff --git a/llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt
new file mode 100644
index 00000000000000..9b30275e61ad9a
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/avx-vnni-int8-64.txt
@@ -0,0 +1,243 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s
--check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vpdpbssd %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymm14 +0xc4,0x42,0x17,0x50,0xe6 + +# ATT: vpdpbssd %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmm14 +0xc4,0x42,0x13,0x50,0xe6 + +# ATT: vpdpbssd 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%rip), %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssd -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbssd ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssd 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssd 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssd (%rip), %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssd -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbssd xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbssds %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymm14 +0xc4,0x42,0x17,0x51,0xe6 + +# ATT: vpdpbssds %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmm14 +0xc4,0x42,0x13,0x51,0xe6 + +# ATT: vpdpbssds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%rip), %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssds -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbssds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbssds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbssds 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbssds (%rip), %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbssds -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbssds xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsud %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymm14 +0xc4,0x42,0x16,0x50,0xe6 + +# ATT: vpdpbsud %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmm14 +0xc4,0x42,0x12,0x50,0xe6 + +# ATT: vpdpbsud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] 
+0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%rip), %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsud -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbsud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsud 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsud (%rip), %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsud -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbsud xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbsuds %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymm14 +0xc4,0x42,0x16,0x51,0xe6 + +# ATT: vpdpbsuds %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmm14 +0xc4,0x42,0x12,0x51,0xe6 + +# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%rip), %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsuds -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbsuds 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbsuds (%rip), %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbsuds -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbsuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuud %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymm14 +0xc4,0x42,0x14,0x50,0xe6 + +# ATT: vpdpbuud %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmm14 +0xc4,0x42,0x10,0x50,0xe6 + +# ATT: vpdpbuud 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%rip), %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuud -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbuud ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: 
vpdpbuud 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuud 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuud (%rip), %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuud -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbuud xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff + +# ATT: vpdpbuuds %ymm14, %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymm14 +0xc4,0x42,0x14,0x51,0xe6 + +# ATT: vpdpbuuds %xmm14, %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmm14 +0xc4,0x42,0x10,0x51,0xe6 + +# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%r8,%rax,4), %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%rip), %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [rip] +0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuuds -1024(,%rbp,2), %ymm13, %ymm12 +# INTEL: vpdpbuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024] +0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff + +# ATT: vpdpbuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456] +0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vpdpbuuds 291(%r8,%rax,4), %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291] +0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: vpdpbuuds (%rip), %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [rip] +0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00 + +# ATT: vpdpbuuds -512(,%rbp,2), %xmm13, %xmm12 +# INTEL: vpdpbuuds xmm12, xmm13, xmmword ptr [2*rbp - 512] +0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff + diff --git a/llvm/test/MC/X86/avx_vnni_int8-32-att.s b/llvm/test/MC/X86/avx_vnni_int8-32-att.s new file mode 100644 index 00000000000000..3ade562079eb3f --- /dev/null +++ b/llvm/test/MC/X86/avx_vnni_int8-32-att.s @@ -0,0 +1,241 @@ +// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnniint8 --show-encoding %s | FileCheck %s + +// CHECK: vpdpbssd %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4] + vpdpbssd %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbssd %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4] + vpdpbssd %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbssd (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x10] + vpdpbssd (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssd -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// 
CHECK: vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbssd (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x10] + vpdpbssd (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbssd -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssd -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbssds %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4] + vpdpbssds %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbssds %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4] + vpdpbssds %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssds 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbssds (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x10] + vpdpbssds (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssds -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssds 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbssds (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x10] + vpdpbssds (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbssds -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssds -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbsud %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4] + vpdpbsud %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbsud %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4] + vpdpbsud %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsud 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsud 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbsud (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x10] + vpdpbsud (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsud -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsud 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsud 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbsud (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x10] + vpdpbsud (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbsud -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsud -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbsuds %ymm4, %ymm3, %ymm2 +// CHECK: encoding: 
[0xc4,0xe2,0x66,0x51,0xd4] + vpdpbsuds %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbsuds %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4] + vpdpbsuds %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsuds 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsuds 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbsuds (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x10] + vpdpbsuds (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbsuds -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbsuds 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbsuds 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbsuds (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x10] + vpdpbsuds (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbsuds -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbuud %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4] + vpdpbuud %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbuud %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0xd4] + vpdpbuud %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuud 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuud 291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbuud (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x10] + vpdpbuud (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuud -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuud 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuud 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbuud (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x10] + vpdpbuud (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbuud -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuud -512(,%ebp,2), %xmm3, %xmm2 + +// CHECK: vpdpbuuds %ymm4, %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4] + vpdpbuuds %ymm4, %ymm3, %ymm2 + +// CHECK: vpdpbuuds %xmm4, %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4] + vpdpbuuds %xmm4, %xmm3, %xmm2 + +// CHECK: vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuuds 268435456(%esp,%esi,8), %ymm3, %ymm2 + +// CHECK: vpdpbuuds 291(%edi,%eax,4), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuuds 
291(%edi,%eax,4), %ymm3, %ymm2 + +// CHECK: vpdpbuuds (%eax), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x10] + vpdpbuuds (%eax), %ymm3, %ymm2 + +// CHECK: vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2 +// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbuuds -1024(,%ebp,2), %ymm3, %ymm2 + +// CHECK: vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbuuds 268435456(%esp,%esi,8), %xmm3, %xmm2 + +// CHECK: vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbuuds 291(%edi,%eax,4), %xmm3, %xmm2 + +// CHECK: vpdpbuuds (%eax), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x10] + vpdpbuuds (%eax), %xmm3, %xmm2 + +// CHECK: vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2 +// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbuuds -512(,%ebp,2), %xmm3, %xmm2 diff --git a/llvm/test/MC/X86/avx_vnni_int8-32-intel.s b/llvm/test/MC/X86/avx_vnni_int8-32-intel.s new file mode 100644 index 00000000000000..aec8ff1c9e5da7 --- /dev/null +++ b/llvm/test/MC/X86/avx_vnni_int8-32-intel.s @@ -0,0 +1,242 @@ +// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnniint8 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vpdpbssd ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0xd4] + vpdpbssd ymm2, ymm3, ymm4 + +// CHECK: vpdpbssd xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0xd4] + vpdpbssd xmm2, xmm3, xmm4 + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x10] + vpdpbssd ymm2, ymm3, ymmword ptr [eax] + +// CHECK: vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024] +// CHECK: encoding: [0xc4,0xe2,0x67,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff] + vpdpbssd ymm2, ymm3, ymmword ptr [2*ebp - 1024] + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssd xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x94,0x87,0x23,0x01,0x00,0x00] + vpdpbssd xmm2, xmm3, xmmword ptr [edi + 4*eax + 291] + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [eax] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x10] + vpdpbssd xmm2, xmm3, xmmword ptr [eax] + +// CHECK: vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512] +// CHECK: encoding: [0xc4,0xe2,0x63,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff] + vpdpbssd xmm2, xmm3, xmmword ptr [2*ebp - 512] + +// CHECK: vpdpbssds ymm2, ymm3, ymm4 +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0xd4] + vpdpbssds ymm2, ymm3, ymm4 + +// CHECK: vpdpbssds xmm2, xmm3, xmm4 +// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0xd4] + vpdpbssds xmm2, xmm3, xmm4 + +// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x94,0xf4,0x00,0x00,0x00,0x10] + vpdpbssds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456] + +// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291] +// CHECK: encoding: 
[0xc4,0xe2,0x67,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbssds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x10]
+          vpdpbssds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x67,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbssds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbssds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbssds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x10]
+          vpdpbssds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x63,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbssds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0xd4]
+          vpdpbsud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0xd4]
+          vpdpbsud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x10]
+          vpdpbsud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x10]
+          vpdpbsud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0xd4]
+          vpdpbsuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0xd4]
+          vpdpbsuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x10]
+          vpdpbsuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x66,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbsuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbsuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x10]
+          vpdpbsuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x62,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0xd4]
+          vpdpbuud ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuud xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0xd4]
+          vpdpbuud xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuud ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuud ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x10]
+          vpdpbuud ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x50,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuud ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuud xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuud xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x10]
+          vpdpbuud xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x50,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuud xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymm4
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0xd4]
+          vpdpbuuds ymm2, ymm3, ymm4
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmm4
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0xd4]
+          vpdpbuuds xmm2, xmm3, xmm4
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuuds ymm2, ymm3, ymmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuuds ymm2, ymm3, ymmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x10]
+          vpdpbuuds ymm2, ymm3, ymmword ptr [eax]
+
+// CHECK: vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+// CHECK: encoding: [0xc4,0xe2,0x64,0x51,0x14,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuuds ymm2, ymm3, ymmword ptr [2*ebp - 1024]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0xf4,0x00,0x00,0x00,0x10]
+          vpdpbuuds xmm2, xmm3, xmmword ptr [esp + 8*esi + 268435456]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x94,0x87,0x23,0x01,0x00,0x00]
+          vpdpbuuds xmm2, xmm3, xmmword ptr [edi + 4*eax + 291]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [eax]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x10]
+          vpdpbuuds xmm2, xmm3, xmmword ptr [eax]
+
+// CHECK: vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+// CHECK: encoding: [0xc4,0xe2,0x60,0x51,0x14,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuuds xmm2, xmm3, xmmword ptr [2*ebp - 512]
+
diff --git a/llvm/test/MC/X86/avx_vnni_int8-64-att.s b/llvm/test/MC/X86/avx_vnni_int8-64-att.s
new file mode 100644
index 00000000000000..92b490a9be8ef4
--- /dev/null
+++ b/llvm/test/MC/X86/avx_vnni_int8-64-att.s
@@ -0,0 +1,242 @@
+// RUN: llvm-mc -triple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-encoding < %s | FileCheck %s
+
+// CHECK: vpdpbssd %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xe6]
+          vpdpbssd %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbssd %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xe6]
+          vpdpbssd %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssd 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbssd 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssd 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbssd (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbssd (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbssd -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbssd -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbssd 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssd 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbssd 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssd 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbssd (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbssd (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbssd -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbssd -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbssds %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xe6]
+          vpdpbssds %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbssds %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xe6]
+          vpdpbssds %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbssds 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssds 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbssds (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbssds (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbssds -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbssds -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbssds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbssds 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssds 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbssds (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbssds (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbssds -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbssds -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbsud %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xe6]
+          vpdpbsud %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbsud %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xe6]
+          vpdpbsud %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbsud 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsud 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbsud (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbsud (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbsud -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsud -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbsud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbsud 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsud 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbsud (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbsud (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbsud -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsud -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xe6]
+          vpdpbsuds %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbsuds %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xe6]
+          vpdpbsuds %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsuds 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbsuds (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsuds -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsuds 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbsuds (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbsuds -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsuds -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbuud %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xe6]
+          vpdpbuud %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbuud %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xe6]
+          vpdpbuud %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuud 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbuud 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuud 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbuud (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbuud (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbuud -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuud -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbuud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuud 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbuud 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuud 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbuud (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbuud (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbuud -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuud -512(,%rbp,2), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds %ymm14, %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xe6]
+          vpdpbuuds %ymm14, %ymm13, %ymm12
+
+// CHECK: vpdpbuuds %xmm14, %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xe6]
+          vpdpbuuds %xmm14, %xmm13, %xmm12
+
+// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuuds 268435456(%rbp,%r14,8), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds 291(%r8,%rax,4), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuuds 291(%r8,%rax,4), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds (%rip), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbuuds (%rip), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds -1024(,%rbp,2), %ymm13, %ymm12
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuuds -1024(,%rbp,2), %ymm13, %ymm12
+
+// CHECK: vpdpbuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuuds 268435456(%rbp,%r14,8), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds 291(%r8,%rax,4), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuuds 291(%r8,%rax,4), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds (%rip), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbuuds (%rip), %xmm13, %xmm12
+
+// CHECK: vpdpbuuds -512(,%rbp,2), %xmm13, %xmm12
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuuds -512(,%rbp,2), %xmm13, %xmm12
+
diff --git a/llvm/test/MC/X86/avx_vnni_int8-64-intel.s b/llvm/test/MC/X86/avx_vnni_int8-64-intel.s
new file mode 100644
index 00000000000000..0d5b066e710f26
--- /dev/null
+++ b/llvm/test/MC/X86/avx_vnni_int8-64-intel.s
@@ -0,0 +1,242 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnniint8 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: vpdpbssd ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xe6]
+          vpdpbssd ymm12, ymm13, ymm14
+
+// CHECK: vpdpbssd xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xe6]
+          vpdpbssd xmm12, xmm13, xmm14
+
+// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x17,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssd ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x17,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssd ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbssd ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbssd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x17,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbssd ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x13,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssd xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x13,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssd xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbssd xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbssd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x13,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbssd xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbssds ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xe6]
+          vpdpbssds ymm12, ymm13, ymm14
+
+// CHECK: vpdpbssds xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xe6]
+          vpdpbssds xmm12, xmm13, xmm14
+
+// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x17,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x17,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbssds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbssds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x17,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbssds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x13,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbssds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x13,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbssds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbssds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbssds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x13,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbssds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbsud ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xe6]
+          vpdpbsud ymm12, ymm13, ymm14
+
+// CHECK: vpdpbsud xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xe6]
+          vpdpbsud xmm12, xmm13, xmm14
+
+// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x16,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x16,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbsud ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x16,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x12,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x12,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbsud xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x12,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xe6]
+          vpdpbsuds ymm12, ymm13, ymm14
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xe6]
+          vpdpbsuds xmm12, xmm13, xmm14
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x16,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x16,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbsuds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x16,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbsuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x12,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbsuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x12,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbsuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbsuds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x12,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbsuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbuud ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xe6]
+          vpdpbuud ymm12, ymm13, ymm14
+
+// CHECK: vpdpbuud xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xe6]
+          vpdpbuud xmm12, xmm13, xmm14
+
+// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x14,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuud ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x14,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuud ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbuud ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x14,0x50,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuud ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x10,0x50,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuud xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x10,0x50,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuud xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x25,0x00,0x00,0x00,0x00]
+          vpdpbuud xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x10,0x50,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuud xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymm14
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xe6]
+          vpdpbuuds ymm12, ymm13, ymm14
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmm14
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xe6]
+          vpdpbuuds xmm12, xmm13, xmm14
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x14,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuuds ymm12, ymm13, ymmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x14,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuuds ymm12, ymm13, ymmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbuuds ymm12, ymm13, ymmword ptr [rip]
+
+// CHECK: vpdpbuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+// CHECK: encoding: [0xc4,0x62,0x14,0x51,0x24,0x6d,0x00,0xfc,0xff,0xff]
+          vpdpbuuds ymm12, ymm13, ymmword ptr [2*rbp - 1024]
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+// CHECK: encoding: [0xc4,0x22,0x10,0x51,0xa4,0xf5,0x00,0x00,0x00,0x10]
+          vpdpbuuds xmm12, xmm13, xmmword ptr [rbp + 8*r14 + 268435456]
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+// CHECK: encoding: [0xc4,0x42,0x10,0x51,0xa4,0x80,0x23,0x01,0x00,0x00]
+          vpdpbuuds xmm12, xmm13, xmmword ptr [r8 + 4*rax + 291]
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [rip]
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x25,0x00,0x00,0x00,0x00]
+          vpdpbuuds xmm12, xmm13, xmmword ptr [rip]
+
+// CHECK: vpdpbuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+// CHECK: encoding: [0xc4,0x62,0x10,0x51,0x24,0x6d,0x00,0xfe,0xff,0xff]
+          vpdpbuuds xmm12, xmm13, xmmword ptr [2*rbp - 512]
+
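A minimal usage sketch, not part of the patch: each vpdpb??d-family instruction
multiplies four adjacent pairs of 8-bit elements per 32-bit lane (signed/signed,
signed/unsigned, or unsigned/unsigned, with the "s" suffix adding saturation) and
accumulates the four products into the destination dword. Assuming a Clang built
with this patch, the intrinsic below should lower to the vpdpbssd instruction
encoded in the tests above; the file name and values are illustrative only.

  // demo.c - sketch only; assumes a compiler with AVX-VNNI-INT8 support.
  // Build (assumed flags): clang -O2 -mavxvnniint8 demo.c
  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m128i w = _mm_set1_epi32(1);  // accumulator: every dword lane = 1
    __m128i a = _mm_set1_epi8(2);   // signed bytes
    __m128i b = _mm_set1_epi8(-3);  // signed bytes
    // Signed x signed byte dot product, accumulated into w;
    // expected to lower to vpdpbssd.
    __m128i r = _mm_dpbssd_epi32(w, a, b);
    // Each dword lane: 1 + 4 * (2 * -3) = -23.
    printf("%d\n", _mm_extract_epi32(r, 0));
    return 0;
  }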