Skip to content

Commit 756f597

Browse files
committed
[X86] Support Intel avxvnni
This patch mainly made the following changes: 1. Support AVX-VNNI instructions; 2. Introduce ExplicitVEXPrefix flag so that vpdpbusd/vpdpbusds/vpdpbusds/vpdpbusds instructions only use vex-encoding when user explicity add {vex} prefix. Differential Revision: https://reviews.llvm.org/D89105
1 parent d11710d commit 756f597

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+2603
-74
lines changed

clang/docs/ClangCommandLineReference.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3253,6 +3253,8 @@ X86
32533253

32543254
.. option:: -mavx512vpopcntdq, -mno-avx512vpopcntdq
32553255

3256+
.. option:: -mavxvnni, -mno-avxvnni
3257+
32563258
.. option:: -mbmi, -mno-bmi
32573259

32583260
.. option:: -mbmi2, -mno-bmi2

clang/docs/ReleaseNotes.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,8 @@ X86 Support in Clang
205205

206206
- Support for ``UINTR`` instructions has been added.
207207

208+
- Support for ``AVXVNNI`` instructions has been added.
209+
208210
Internal API Changes
209211
--------------------
210212

clang/include/clang/Basic/BuiltinsX86.def

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -960,17 +960,17 @@ TARGET_BUILTIN(__builtin_ia32_alignq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl
960960
TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f")
961961
TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f")
962962

963-
TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
964-
TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
963+
TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
964+
TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
965965
TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
966-
TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
967-
TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
966+
TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
967+
TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
968968
TARGET_BUILTIN(__builtin_ia32_vpdpbusds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
969-
TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
970-
TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
969+
TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
970+
TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
971971
TARGET_BUILTIN(__builtin_ia32_vpdpwssd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
972-
TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
973-
TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
972+
TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
973+
TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
974974
TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
975975

976976
TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")

clang/include/clang/Driver/Options.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3235,6 +3235,8 @@ def mavx512vpopcntdq : Flag<["-"], "mavx512vpopcntdq">, Group<m_x86_Features_Gro
32353235
def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
32363236
def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>;
32373237
def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
3238+
def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;
3239+
def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>;
32383240
def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
32393241
def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
32403242
def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>;

clang/lib/Basic/Targets/X86.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
306306
HasAMXINT8 = true;
307307
} else if (Feature == "+amx-tile") {
308308
HasAMXTILE = true;
309+
} else if (Feature == "+avxvnni") {
310+
HasAVXVNNI = true;
309311
} else if (Feature == "+serialize") {
310312
HasSERIALIZE = true;
311313
} else if (Feature == "+tsxldtrk") {
@@ -728,6 +730,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
728730
Builder.defineMacro("__AMXINT8__");
729731
if (HasAMXBF16)
730732
Builder.defineMacro("__AMXBF16__");
733+
if (HasAVXVNNI)
734+
Builder.defineMacro("__AVXVNNI__");
731735
if (HasSERIALIZE)
732736
Builder.defineMacro("__SERIALIZE__");
733737
if (HasTSXLDTRK)
@@ -846,6 +850,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
846850
.Case("avx512vbmi2", true)
847851
.Case("avx512ifma", true)
848852
.Case("avx512vp2intersect", true)
853+
.Case("avxvnni", true)
849854
.Case("bmi", true)
850855
.Case("bmi2", true)
851856
.Case("cldemote", true)
@@ -918,6 +923,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
918923
.Case("amx-bf16", HasAMXBF16)
919924
.Case("amx-int8", HasAMXINT8)
920925
.Case("amx-tile", HasAMXTILE)
926+
.Case("avxvnni", HasAVXVNNI)
921927
.Case("avx", SSELevel >= AVX)
922928
.Case("avx2", SSELevel >= AVX2)
923929
.Case("avx512f", SSELevel >= AVX512F)

clang/lib/Basic/Targets/X86.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
130130
bool HasKL = false; // For key locker
131131
bool HasWIDEKL = false; // For wide key locker
132132
bool HasHRESET = false;
133+
bool HasAVXVNNI = false;
133134
bool HasAMXTILE = false;
134135
bool HasAMXINT8 = false;
135136
bool HasAMXBF16 = false;

clang/lib/Headers/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ set(files
3535
avx512vnniintrin.h
3636
avx512vlvnniintrin.h
3737
avxintrin.h
38+
avxvnniintrin.h
3839
bmi2intrin.h
3940
bmiintrin.h
4041
__clang_cuda_builtin_vars.h

clang/lib/Headers/avx512vlvnniintrin.h

Lines changed: 150 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,157 @@
1818
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
1919
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
2020

21+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
22+
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
23+
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
24+
/// in \a S, and store the packed 32-bit results in DST.
25+
///
26+
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
27+
///
28+
/// \operation
29+
/// FOR j := 0 to 7
30+
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
31+
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
32+
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
33+
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
34+
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
35+
/// ENDFOR
36+
/// DST[MAX:256] := 0
37+
/// \endoperation
38+
#define _mm256_dpbusd_epi32(S, A, B) \
39+
(__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
2140

22-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
23-
_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
24-
{
25-
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
26-
(__v8si)__B);
27-
}
41+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
42+
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
43+
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
44+
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
45+
///
46+
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
47+
///
48+
/// \operation
49+
/// FOR j := 0 to 7
50+
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
51+
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
52+
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
53+
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
54+
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
55+
/// ENDFOR
56+
/// DST[MAX:256] := 0
57+
/// \endoperation
58+
#define _mm256_dpbusds_epi32(S, A, B) \
59+
(__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
60+
61+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
62+
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
63+
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
64+
/// and store the packed 32-bit results in DST.
65+
///
66+
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
67+
///
68+
/// \operation
69+
/// FOR j := 0 to 7
70+
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
71+
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
72+
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
73+
/// ENDFOR
74+
/// DST[MAX:256] := 0
75+
/// \endoperation
76+
#define _mm256_dpwssd_epi32(S, A, B) \
77+
(__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
78+
79+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
80+
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
81+
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
82+
/// using signed saturation, and store the packed 32-bit results in DST.
83+
///
84+
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
85+
///
86+
/// \operation
87+
/// FOR j := 0 to 7
88+
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
89+
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
90+
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
91+
/// ENDFOR
92+
/// DST[MAX:256] := 0
93+
/// \endoperation
94+
#define _mm256_dpwssds_epi32(S, A, B) \
95+
(__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
96+
97+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
98+
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
99+
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
100+
/// in \a S, and store the packed 32-bit results in DST.
101+
///
102+
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
103+
///
104+
/// \operation
105+
/// FOR j := 0 to 3
106+
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
107+
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
108+
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
109+
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
110+
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
111+
/// ENDFOR
112+
/// DST[MAX:128] := 0
113+
/// \endoperation
114+
#define _mm_dpbusd_epi32(S, A, B) \
115+
(__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
116+
117+
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
118+
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
119+
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
120+
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
121+
///
122+
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
123+
///
124+
/// \operation
125+
/// FOR j := 0 to 3
126+
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
127+
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
128+
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
129+
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
130+
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
131+
/// ENDFOR
132+
/// DST[MAX:128] := 0
133+
/// \endoperation
134+
#define _mm_dpbusds_epi32(S, A, B) \
135+
(__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
136+
137+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
138+
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
139+
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
140+
/// and store the packed 32-bit results in DST.
141+
///
142+
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
143+
///
144+
/// \operation
145+
/// FOR j := 0 to 3
146+
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
147+
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
148+
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
149+
/// ENDFOR
150+
/// DST[MAX:128] := 0
151+
/// \endoperation
152+
#define _mm_dpwssd_epi32(S, A, B) \
153+
(__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
154+
155+
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
156+
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
157+
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
158+
/// using signed saturation, and store the packed 32-bit results in DST.
159+
///
160+
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
161+
///
162+
/// \operation
163+
/// FOR j := 0 to 3
164+
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
165+
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
166+
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
167+
/// ENDFOR
168+
/// DST[MAX:128] := 0
169+
/// \endoperation
170+
#define _mm_dpwssds_epi32(S, A, B) \
171+
(__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
28172

29173
static __inline__ __m256i __DEFAULT_FN_ATTRS256
30174
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
@@ -42,13 +186,6 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
42186
(__v8si)_mm256_setzero_si256());
43187
}
44188

45-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
46-
_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
47-
{
48-
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
49-
(__v8si)__B);
50-
}
51-
52189
static __inline__ __m256i __DEFAULT_FN_ATTRS256
53190
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
54191
{
@@ -65,13 +202,6 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
65202
(__v8si)_mm256_setzero_si256());
66203
}
67204

68-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
69-
_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
70-
{
71-
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
72-
(__v8si)__B);
73-
}
74-
75205
static __inline__ __m256i __DEFAULT_FN_ATTRS256
76206
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
77207
{
@@ -88,13 +218,6 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
88218
(__v8si)_mm256_setzero_si256());
89219
}
90220

91-
static __inline__ __m256i __DEFAULT_FN_ATTRS256
92-
_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
93-
{
94-
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
95-
(__v8si)__B);
96-
}
97-
98221
static __inline__ __m256i __DEFAULT_FN_ATTRS256
99222
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
100223
{
@@ -111,13 +234,6 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
111234
(__v8si)_mm256_setzero_si256());
112235
}
113236

114-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
115-
_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
116-
{
117-
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
118-
(__v4si)__B);
119-
}
120-
121237
static __inline__ __m128i __DEFAULT_FN_ATTRS128
122238
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
123239
{
@@ -134,13 +250,6 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
134250
(__v4si)_mm_setzero_si128());
135251
}
136252

137-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
138-
_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
139-
{
140-
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
141-
(__v4si)__B);
142-
}
143-
144253
static __inline__ __m128i __DEFAULT_FN_ATTRS128
145254
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
146255
{
@@ -157,13 +266,6 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
157266
(__v4si)_mm_setzero_si128());
158267
}
159268

160-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
161-
_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
162-
{
163-
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
164-
(__v4si)__B);
165-
}
166-
167269
static __inline__ __m128i __DEFAULT_FN_ATTRS128
168270
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
169271
{
@@ -180,13 +282,6 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
180282
(__v4si)_mm_setzero_si128());
181283
}
182284

183-
static __inline__ __m128i __DEFAULT_FN_ATTRS128
184-
_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
185-
{
186-
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
187-
(__v4si)__B);
188-
}
189-
190285
static __inline__ __m128i __DEFAULT_FN_ATTRS128
191286
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
192287
{

0 commit comments

Comments
 (0)