diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 5346a0209928e..737cf9897d826 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -19,22 +19,112 @@ #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128))) /* SSE4 Multiple Packed Sums of Absolute Difference. */ +/// Computes sixteen sum of absolute difference (SAD) operations on sets of +/// four unsigned 8-bit integers from the 256-bit integer vectors \a X and +/// \a Y. +/// +/// Eight SAD results are computed using the lower half of the input +/// vectors, and another eight using the upper half. These 16-bit values +/// are returned in the lower and upper halves of the 256-bit result, +/// respectively. +/// +/// A single SAD operation selects four bytes from \a X and four bytes from +/// \a Y as input. It computes the differences between each \a X byte and +/// the corresponding \a Y byte, takes the absolute value of each +/// difference, and sums these four values to form one 16-bit result. The +/// intrinsic computes 16 of these results with different sets of input +/// bytes. +/// +/// For each set of eight results, the SAD operations use the same four +/// bytes from \a Y; the starting bit position for these four bytes is +/// specified by \a M[1:0] times 32. The eight operations use successive +/// sets of four bytes from \a X; the starting bit position for the first +/// set of four bytes is specified by \a M[2] times 32. These bit positions +/// are all relative to the 128-bit lane for each set of eight operations. +/// +/// \code{.operation} +/// r := 0 +/// FOR i := 0 TO 1 +/// j := i*3 +/// Ybase := M[j+1:j]*32 + i*128 +/// Xbase := M[j+2]*32 + i*128 +/// FOR k := 0 TO 3 +/// temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase]) +/// temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8]) +/// temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16]) +/// temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24]) +/// result[r+15:r] := temp0 + temp1 + temp2 + temp3 +/// Xbase := Xbase + 8 +/// r := r + 16 +/// ENDFOR +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// \code +/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the \c VMPSADBW instruction. +/// +/// \param X +/// A 256-bit integer vector containing one of the inputs. +/// \param Y +/// A 256-bit integer vector containing one of the inputs. +/// \param M +/// An unsigned immediate value specifying the starting positions of the +/// bytes to operate on. +/// \returns A 256-bit vector of [16 x i16] containing the result. #define _mm256_mpsadbw_epu8(X, Y, M) \ ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ (__v32qi)(__m256i)(Y), (int)(M))) +/// Computes the absolute value of each signed byte in the 256-bit integer +/// vector \a __a and returns each value in the corresponding byte of +/// the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPABSB instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi8(__m256i __a) { return (__m256i)__builtin_elementwise_abs((__v32qs)__a); } +/// Computes the absolute value of each signed 16-bit element in the 256-bit +/// vector of [16 x i16] in \a __a and returns each value in the +/// corresponding element of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPABSW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16]. +/// \returns A 256-bit vector of [16 x i16] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi16(__m256i __a) { return (__m256i)__builtin_elementwise_abs((__v16hi)__a); } +/// Computes the absolute value of each signed 32-bit element in the 256-bit +/// vector of [8 x i32] in \a __a and returns each value in the +/// corresponding element of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPABSD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \returns A 256-bit vector of [8 x i32] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi32(__m256i __a) { @@ -345,24 +435,88 @@ _mm256_adds_epu16(__m256i __a, __m256i __b) ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ (__v32qi)(__m256i)(b), (n))) +/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and +/// \a __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPAND instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_and_si256(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a & (__v4du)__b); } +/// Computes the bitwise AND of the 256-bit integer vector in \a __b with +/// the bitwise NOT of the 256-bit integer vector in \a __a. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPANDN instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_andnot_si256(__m256i __a, __m256i __b) { return (__m256i)(~(__v4du)__a & (__v4du)__b); } +/// Computes the averages of the corresponding unsigned bytes in the two +/// 256-bit integer vectors in \a __a and \a __b and returns each +/// average in the corresponding byte of the 256-bit result. +/// +/// \code{.operation} +/// FOR i := 0 TO 31 +/// j := i*8 +/// result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1 +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPAVGB instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); } +/// Computes the averages of the corresponding unsigned 16-bit integers in +/// the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns +/// each average in the corresponding element of the 256-bit result. +/// +/// \code{.operation} +/// FOR i := 0 TO 15 +/// j := i*16 +/// result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1 +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPAVGW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16]. +/// \param __b +/// A 256-bit vector of [16 x i16]. +/// \returns A 256-bit vector of [16 x i16] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a, __m256i __b) { @@ -765,72 +919,228 @@ _mm256_madd_epi16(__m256i __a, __m256i __b) return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); } +/// Compares the corresponding signed bytes in the two 256-bit integer vectors +/// in \a __a and \a __b and returns the larger of each pair in the +/// corresponding byte of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMAXSB instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b); } +/// Compares the corresponding signed 16-bit integers in the two 256-bit +/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of +/// each pair in the corresponding element of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMAXSW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16]. +/// \param __b +/// A 256-bit vector of [16 x i16]. +/// \returns A 256-bit vector of [16 x i16] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b); } +/// Compares the corresponding signed 32-bit integers in the two 256-bit +/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of +/// each pair in the corresponding element of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMAXSD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \param __b +/// A 256-bit vector of [8 x i32]. +/// \returns A 256-bit vector of [8 x i32] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b); } +/// Compares the corresponding unsigned bytes in the two 256-bit integer +/// vectors in \a __a and \a __b and returns the larger of each pair in +/// the corresponding byte of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMAXUB instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b); } +/// Compares the corresponding unsigned 16-bit integers in the two 256-bit +/// vectors of [16 x i16] in \a __a and \a __b and returns the larger of +/// each pair in the corresponding element of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMAXUW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16]. +/// \param __b +/// A 256-bit vector of [16 x i16]. +/// \returns A 256-bit vector of [16 x i16] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b); } +/// Compares the corresponding unsigned 32-bit integers in the two 256-bit +/// vectors of [8 x i32] in \a __a and \a __b and returns the larger of +/// each pair in the corresponding element of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMAXUD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \param __b +/// A 256-bit vector of [8 x i32]. +/// \returns A 256-bit vector of [8 x i32] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu32(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b); } +/// Compares the corresponding signed bytes in the two 256-bit integer vectors +/// in \a __a and \a __b and returns the smaller of each pair in the +/// corresponding byte of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMINSB instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b); } +/// Compares the corresponding signed 16-bit integers in the two 256-bit +/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of +/// each pair in the corresponding element of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMINSW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16]. +/// \param __b +/// A 256-bit vector of [16 x i16]. +/// \returns A 256-bit vector of [16 x i16] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b); } +/// Compares the corresponding signed 32-bit integers in the two 256-bit +/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of +/// each pair in the corresponding element of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMINSD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \param __b +/// A 256-bit vector of [8 x i32]. +/// \returns A 256-bit vector of [8 x i32] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b); } +/// Compares the corresponding unsigned bytes in the two 256-bit integer +/// vectors in \a __a and \a __b and returns the smaller of each pair in +/// the corresponding byte of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMINUB instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b); } +/// Compares the corresponding unsigned 16-bit integers in the two 256-bit +/// vectors of [16 x i16] in \a __a and \a __b and returns the smaller of +/// each pair in the corresponding element of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMINUW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16]. +/// \param __b +/// A 256-bit vector of [16 x i16]. +/// \returns A 256-bit vector of [16 x i16] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b); } +/// Compares the corresponding unsigned 32-bit integers in the two 256-bit +/// vectors of [8 x i32] in \a __a and \a __b and returns the smaller of +/// each pair in the corresponding element of the 256-bit result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMINUD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \param __b +/// A 256-bit vector of [8 x i32]. +/// \returns A 256-bit vector of [8 x i32] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu32(__m256i __a, __m256i __b) { @@ -1076,12 +1386,63 @@ _mm256_mul_epu32(__m256i __a, __m256i __b) return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); } +/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and +/// \a __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPOR instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_or_si256(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a | (__v4du)__b); } +/// Computes four sum of absolute difference (SAD) operations on sets of eight +/// unsigned 8-bit integers from the 256-bit integer vectors \a __a and +/// \a __b. +/// +/// One SAD result is computed for each set of eight bytes from \a __a and +/// eight bytes from \a __b. The zero-extended SAD value is returned in the +/// corresponding 64-bit element of the result. +/// +/// A single SAD operation takes the differences between the corresponding +/// bytes of \a __a and \a __b, takes the absolute value of each difference, +/// and sums these eight values to form one 16-bit result. This operation +/// is repeated four times with successive sets of eight bytes. +/// +/// \code{.operation} +/// FOR i := 0 TO 3 +/// j := i*64 +/// temp0 := ABS(__a[j+7:j] - __b[j+7:j]) +/// temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8]) +/// temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16]) +/// temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24]) +/// temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32]) +/// temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40]) +/// temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48]) +/// temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56]) +/// result[j+15:j] := temp0 + temp1 + temp2 + temp3 + +/// temp4 + temp5 + temp6 + temp7 +/// result[j+63:j+16] := 0 +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSADBW instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b) { @@ -1233,18 +1594,63 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b) #define _mm256_shufflelo_epi16(a, imm) \ ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) +/// Sets each byte of the result to the corresponding byte of the 256-bit +/// integer vector in \a __a, the negative of that byte, or zero, depending +/// on whether the corresponding byte of the 256-bit integer vector in +/// \a __b is greater than zero, less than zero, or equal to zero, +/// respectively. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSIGNB instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector]. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); } +/// Sets each element of the result to the corresponding element of the +/// 256-bit vector of [16 x i16] in \a __a, the negative of that element, +/// or zero, depending on whether the corresponding element of the 256-bit +/// vector of [16 x i16] in \a __b is greater than zero, less than zero, or +/// equal to zero, respectively. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSIGNW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16]. +/// \param __b +/// A 256-bit vector of [16 x i16]. +/// \returns A 256-bit vector of [16 x i16] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); } +/// Sets each element of the result to the corresponding element of the +/// 256-bit vector of [8 x i32] in \a __a, the negative of that element, or +/// zero, depending on whether the corresponding element of the 256-bit +/// vector of [8 x i32] in \a __b is greater than zero, less than zero, or +/// equal to zero, respectively. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSIGND instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \param __b +/// A 256-bit vector of [8 x i32]. +/// \returns A 256-bit vector of [8 x i32] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a, __m256i __b) { @@ -2132,6 +2538,18 @@ _mm256_unpacklo_epi64(__m256i __a, __m256i __b) return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); } +/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and +/// \a __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPXOR instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns A 256-bit integer vector containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_xor_si256(__m256i __a, __m256i __b) {