18
18
/// Attribute list placed on the 128-bit (__m128i) inline intrinsic functions
/// in this header: always_inline, nodebug, a target of "avx512vl,avx512vnni",
/// and a min_vector_width of 128.
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
19
19
/// Attribute list placed on the 256-bit (__m256i) inline intrinsic functions
/// in this header: always_inline, nodebug, a target of "avx512vl,avx512vnni",
/// and a min_vector_width of 256.
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
20
20
21
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
///
/// Implemented as a function-like macro. Each argument is evaluated exactly
/// once, and the whole expansion is parenthesized so that the result cast is
/// applied before any operator written after the call.
#define _mm256_dpbusd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
21
40
22
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
23
- _mm256_dpbusd_epi32 (__m256i __S , __m256i __A , __m256i __B )
24
- {
25
- return (__m256i )__builtin_ia32_vpdpbusd256 ((__v8si )__S , (__v8si )__A ,
26
- (__v8si )__B );
27
- }
41
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
///
/// Implemented as a function-like macro. Each argument is evaluated exactly
/// once, and the whole expansion is parenthesized so that the result cast is
/// applied before any operator written after the call.
#define _mm256_dpbusds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
60
+
61
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
///
/// Implemented as a function-like macro. Each argument is evaluated exactly
/// once, and the whole expansion is parenthesized so that the result cast is
/// applied before any operator written after the call.
#define _mm256_dpwssd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
78
+
79
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
///
/// Implemented as a function-like macro. Each argument is evaluated exactly
/// once, and the whole expansion is parenthesized so that the result cast is
/// applied before any operator written after the call.
#define _mm256_dpwssds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
96
+
97
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
///
/// Implemented as a function-like macro. Each argument is evaluated exactly
/// once, and the whole expansion is parenthesized so that the result cast is
/// applied before any operator written after the call.
#define _mm_dpbusd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
116
+
117
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
///
/// Implemented as a function-like macro. Each argument is evaluated exactly
/// once, and the whole expansion is parenthesized so that the result cast is
/// applied before any operator written after the call.
#define _mm_dpbusds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
136
+
137
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
///
/// Implemented as a function-like macro. Each argument is evaluated exactly
/// once, and the whole expansion is parenthesized so that the result cast is
/// applied before any operator written after the call.
#define _mm_dpwssd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
154
+
155
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
///
/// Implemented as a function-like macro. Each argument is evaluated exactly
/// once, and the whole expansion is parenthesized so that the result cast is
/// applied before any operator written after the call.
#define _mm_dpwssds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
28
172
29
173
static __inline__ __m256i __DEFAULT_FN_ATTRS256
30
174
_mm256_mask_dpbusd_epi32 (__m256i __S , __mmask8 __U , __m256i __A , __m256i __B )
@@ -42,13 +186,6 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
42
186
(__v8si )_mm256_setzero_si256 ());
43
187
}
44
188
45
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
46
- _mm256_dpbusds_epi32 (__m256i __S , __m256i __A , __m256i __B )
47
- {
48
- return (__m256i )__builtin_ia32_vpdpbusds256 ((__v8si )__S , (__v8si )__A ,
49
- (__v8si )__B );
50
- }
51
-
52
189
static __inline__ __m256i __DEFAULT_FN_ATTRS256
53
190
_mm256_mask_dpbusds_epi32 (__m256i __S , __mmask8 __U , __m256i __A , __m256i __B )
54
191
{
@@ -65,13 +202,6 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
65
202
(__v8si )_mm256_setzero_si256 ());
66
203
}
67
204
68
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
69
- _mm256_dpwssd_epi32 (__m256i __S , __m256i __A , __m256i __B )
70
- {
71
- return (__m256i )__builtin_ia32_vpdpwssd256 ((__v8si )__S , (__v8si )__A ,
72
- (__v8si )__B );
73
- }
74
-
75
205
static __inline__ __m256i __DEFAULT_FN_ATTRS256
76
206
_mm256_mask_dpwssd_epi32 (__m256i __S , __mmask8 __U , __m256i __A , __m256i __B )
77
207
{
@@ -88,13 +218,6 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
88
218
(__v8si )_mm256_setzero_si256 ());
89
219
}
90
220
91
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
92
- _mm256_dpwssds_epi32 (__m256i __S , __m256i __A , __m256i __B )
93
- {
94
- return (__m256i )__builtin_ia32_vpdpwssds256 ((__v8si )__S , (__v8si )__A ,
95
- (__v8si )__B );
96
- }
97
-
98
221
static __inline__ __m256i __DEFAULT_FN_ATTRS256
99
222
_mm256_mask_dpwssds_epi32 (__m256i __S , __mmask8 __U , __m256i __A , __m256i __B )
100
223
{
@@ -111,13 +234,6 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
111
234
(__v8si )_mm256_setzero_si256 ());
112
235
}
113
236
114
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
115
- _mm_dpbusd_epi32 (__m128i __S , __m128i __A , __m128i __B )
116
- {
117
- return (__m128i )__builtin_ia32_vpdpbusd128 ((__v4si )__S , (__v4si )__A ,
118
- (__v4si )__B );
119
- }
120
-
121
237
static __inline__ __m128i __DEFAULT_FN_ATTRS128
122
238
_mm_mask_dpbusd_epi32 (__m128i __S , __mmask8 __U , __m128i __A , __m128i __B )
123
239
{
@@ -134,13 +250,6 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
134
250
(__v4si )_mm_setzero_si128 ());
135
251
}
136
252
137
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
138
- _mm_dpbusds_epi32 (__m128i __S , __m128i __A , __m128i __B )
139
- {
140
- return (__m128i )__builtin_ia32_vpdpbusds128 ((__v4si )__S , (__v4si )__A ,
141
- (__v4si )__B );
142
- }
143
-
144
253
static __inline__ __m128i __DEFAULT_FN_ATTRS128
145
254
_mm_mask_dpbusds_epi32 (__m128i __S , __mmask8 __U , __m128i __A , __m128i __B )
146
255
{
@@ -157,13 +266,6 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
157
266
(__v4si )_mm_setzero_si128 ());
158
267
}
159
268
160
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
161
- _mm_dpwssd_epi32 (__m128i __S , __m128i __A , __m128i __B )
162
- {
163
- return (__m128i )__builtin_ia32_vpdpwssd128 ((__v4si )__S , (__v4si )__A ,
164
- (__v4si )__B );
165
- }
166
-
167
269
static __inline__ __m128i __DEFAULT_FN_ATTRS128
168
270
_mm_mask_dpwssd_epi32 (__m128i __S , __mmask8 __U , __m128i __A , __m128i __B )
169
271
{
@@ -180,13 +282,6 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
180
282
(__v4si )_mm_setzero_si128 ());
181
283
}
182
284
183
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
184
- _mm_dpwssds_epi32 (__m128i __S , __m128i __A , __m128i __B )
185
- {
186
- return (__m128i )__builtin_ia32_vpdpwssds128 ((__v4si )__S , (__v4si )__A ,
187
- (__v4si )__B );
188
- }
189
-
190
285
static __inline__ __m128i __DEFAULT_FN_ATTRS128
191
286
_mm_mask_dpwssds_epi32 (__m128i __S , __mmask8 __U , __m128i __A , __m128i __B )
192
287
{
0 commit comments