diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index e218a6a46a147..f1106b97bb382 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -164,6 +164,12 @@ set(ppc_wrapper_files ppc_wrappers/pmmintrin.h ppc_wrappers/tmmintrin.h ppc_wrappers/smmintrin.h + ppc_wrappers/bmiintrin.h + ppc_wrappers/bmi2intrin.h + ppc_wrappers/immintrin.h + ppc_wrappers/tmmintrin.h + ppc_wrappers/x86intrin.h + ppc_wrappers/x86gprintrin.h ) set(openmp_wrapper_files diff --git a/clang/lib/Headers/ppc_wrappers/bmi2intrin.h b/clang/lib/Headers/ppc_wrappers/bmi2intrin.h new file mode 100644 index 0000000000000..433fe0887a071 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/bmi2intrin.h @@ -0,0 +1,133 @@ +/*===---- bmiintrin.h - Implementation of BMI2 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use directly; include instead." +#endif + +#ifndef BMI2INTRIN_H_ +#define BMI2INTRIN_H_ + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u32(unsigned int __X, unsigned int __Y) { + return ((__X << (32 - __Y)) >> (32 - __Y)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { + unsigned long long __res = (unsigned long long)__X * __Y; + *__P = (unsigned int)(__res >> 32); + return (unsigned int)__res; +} + +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u64(unsigned long long __X, unsigned long long __Y) { + return ((__X << (64 - __Y)) >> (64 - __Y)); +} + +/* __int128 requires base 64-bit. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u64(unsigned long long __X, unsigned long long __Y, + unsigned long long *__P) { + unsigned __int128 __res = (unsigned __int128)__X * __Y; + *__P = (unsigned long long)(__res >> 64); + return (unsigned long long)__res; +} + +#ifdef _ARCH_PWR7 +/* popcount and bpermd require power7 minimum. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u64(unsigned long long __X, unsigned long long __M) { + unsigned long result = 0x0UL; + const unsigned long mask = 0x8000000000000000UL; + unsigned long m = __M; + unsigned long c, t; + unsigned long p; + + /* The pop-count of the mask gives the number of the bits from + source to process. This is also needed to shift bits from the + source into the correct position for the result. */ + p = 64 - __builtin_popcountl(__M); + + /* The loop is for the number of '1' bits in the mask and clearing + each mask bit as it is processed. */ + while (m != 0) { + c = __builtin_clzl(m); + t = __X << (p - c); + m ^= (mask >> c); + result |= (t & (mask >> c)); + p++; + } + return (result); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u64(unsigned long long __X, unsigned long long __M) { + unsigned long p = 0x4040404040404040UL; // initial bit permute control + const unsigned long mask = 0x8000000000000000UL; + unsigned long m = __M; + unsigned long c; + unsigned long result; + + /* if the mask is constant and selects 8 bits or less we can use + the Power8 Bit permute instruction. */ + if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) { + /* Also if the pext mask is constant, then the popcount is + constant, we can evaluate the following loop at compile + time and use a constant bit permute vector. */ + for (long i = 0; i < __builtin_popcountl(__M); i++) { + c = __builtin_clzl(m); + p = (p << 8) | c; + m ^= (mask >> c); + } + result = __builtin_bpermd(p, __X); + } else { + p = 64 - __builtin_popcountl(__M); + result = 0; + /* We could a use a for loop here, but that combined with + -funroll-loops can expand to a lot of code. The while + loop avoids unrolling and the compiler commons the xor + from clearing the mask bit with the (m != 0) test. The + result is a more compact loop setup and body. */ + while (m != 0) { + unsigned long t; + c = __builtin_clzl(m); + t = (__X & (mask >> c)) >> (p - c); + m ^= (mask >> c); + result |= (t); + p++; + } + } + return (result); +} + +/* these 32-bit implementations depend on 64-bit pdep/pext + which depend on _ARCH_PWR7. */ +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u32(unsigned int __X, unsigned int __Y) { + return _pdep_u64(__X, __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u32(unsigned int __X, unsigned int __Y) { + return _pext_u64(__X, __Y); +} +#endif /* _ARCH_PWR7 */ +#endif /* __PPC64__ */ + +#endif /* BMI2INTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/bmiintrin.h b/clang/lib/Headers/ppc_wrappers/bmiintrin.h new file mode 100644 index 0000000000000..7d3315958c7b7 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/bmiintrin.h @@ -0,0 +1,165 @@ +/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use directly; include instead." +#endif + +#ifndef BMIINTRIN_H_ +#define BMIINTRIN_H_ + +extern __inline unsigned short + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u16(unsigned short __X) { + return __builtin_ctz(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u32(unsigned int __X, unsigned int __Y) { + return (~__X & __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) { + return ((__X << (32 - (__L + __P))) >> (32 - __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u32(unsigned int __X, unsigned int __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y >> 8) & 0xFF; + return (_bextr_u32(__X, __P, __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u32(unsigned int __X) { + return (__X & -__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u32(unsigned int __X) { + return __blsi_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u32(unsigned int __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u32(unsigned int __X) { + return __blsmsk_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u32(unsigned int __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u32(unsigned int __X) { + return __blsr_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); +} + +/* use the 64-bit shift, rotate, and count leading zeros instructions + for long long. */ +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u64(unsigned long long __X, unsigned long long __Y) { + return (~__X & __Y); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) { + return ((__X << (64 - (__L + __P))) >> (64 - __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u64(unsigned long long __X, unsigned long long __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y & 0xFF00) >> 8; + return (_bextr_u64(__X, __P, __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u64(unsigned long long __X) { + return __X & -__X; +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u64(unsigned long long __X) { + return __blsi_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u64(unsigned long long __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u64(unsigned long long __X) { + return __blsmsk_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u64(unsigned long long __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u64(unsigned long long __X) { + return __blsr_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} +#endif /* __PPC64__ */ + +#endif /* BMIINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/emmintrin.h b/clang/lib/Headers/ppc_wrappers/emmintrin.h index 82a71788b27a8..278225f919758 100644 --- a/clang/lib/Headers/ppc_wrappers/emmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/emmintrin.h @@ -405,20 +405,10 @@ _mm_cmpnge_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_pd (__m128d __A, __m128d __B) { -#if _ARCH_PWR8 __v2du c, d; /* Compare against self will return false (0's) if NAN. */ c = (__v2du)vec_cmpeq (__A, __A); d = (__v2du)vec_cmpeq (__B, __B); -#else - __v2du a, b; - __v2du c, d; - const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000}; - a = (__v2du)vec_abs ((__v2df)__A); - b = (__v2du)vec_abs ((__v2df)__B); - c = (__v2du)vec_cmpgt (double_exp_mask, a); - d = (__v2du)vec_cmpgt (double_exp_mask, b); -#endif /* A != NAN and B != NAN. */ return ((__m128d)vec_and(c, d)); } @@ -861,7 +851,11 @@ _mm_cvtpd_epi32 (__m128d __A) : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -896,7 +890,11 @@ _mm_cvtpd_ps (__m128d __A) : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4sf) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -925,7 +923,11 @@ _mm_cvttpd_epi32 (__m128d __A) : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -1205,6 +1207,9 @@ _mm_loadl_pd (__m128d __A, double const *__B) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pd (__m128d __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v2du) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1224,6 +1229,7 @@ _mm_movemask_pd (__m128d __A) #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -1434,6 +1440,7 @@ _mm_mul_su32 (__m64 __A, __m64 __B) return ((__m64)a * (__m64)b); } +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_epu32 (__m128i __A, __m128i __B) { @@ -1460,6 +1467,7 @@ _mm_mul_epu32 (__m128i __A, __m128i __B) return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); #endif } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi16 (__m128i __A, int __B) @@ -1749,7 +1757,7 @@ _mm_sll_epi64 (__m128i __A, __m128i __B) lshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (lshift, shmax); result = vec_sl ((__v2du) __A, lshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + result = vec_sel ((__v2du) shmask, result, shmask); return (__m128i) result; } @@ -1843,7 +1851,7 @@ _mm_srl_epi64 (__m128i __A, __m128i __B) rshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (rshift, shmax); result = vec_sr ((__v2du) __A, rshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + result = vec_sel ((__v2du) shmask, result, shmask); return (__m128i) result; } @@ -1995,10 +2003,14 @@ _mm_min_epu8 (__m128i __A, __m128i __B) #ifdef _ARCH_PWR8 /* Intrinsic functions that require PowerISA 2.07 minimum. */ -/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ +/* Return a mask created from the most significant bit of each 8-bit + element in A. */ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_epi8 (__m128i __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v16qu) __A); +#else __vector unsigned long long result; static const __vector unsigned char perm_mask = { @@ -2015,6 +2027,7 @@ _mm_movemask_epi8 (__m128i __A) #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -2158,27 +2171,37 @@ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __arti _mm_sad_epu8 (__m128i __A, __m128i __B) { __v16qu a, b; - __v16qu vmin, vmax, vabsdiff; + __v16qu vabsdiff; __v4si vsum; const __v4su zero = { 0, 0, 0, 0 }; __v4si result; a = (__v16qu) __A; b = (__v16qu) __B; - vmin = vec_min (a, b); - vmax = vec_max (a, b); +#ifndef _ARCH_PWR9 + __v16qu vmin = vec_min (a, b); + __v16qu vmax = vec_max (a, b); vabsdiff = vec_sub (vmax, vmin); +#else + vabsdiff = vec_absd (a, b); +#endif /* Sum four groups of bytes into integers. */ vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); +#ifdef __LITTLE_ENDIAN__ + /* Sum across four integers with two integer results. */ + asm ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero)); + /* Note: vec_sum2s could be used here, but on little-endian, vector + shifts are added that are not needed for this use-case. + A vector shift to correctly position the 32-bit integer results + (currently at [0] and [2]) to [1] and [3] would then need to be + swapped back again since the desired results are two 64-bit + integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */ +#else /* Sum across four integers with two integer results. */ result = vec_sum2s (vsum, (__vector signed int) zero); /* Rotate the sums into the correct position. */ -#ifdef __LITTLE_ENDIAN__ - result = vec_sld (result, result, 4); -#else result = vec_sld (result, result, 6); #endif - /* Rotate the sums into the correct position. */ return (__m128i) result; } diff --git a/clang/lib/Headers/ppc_wrappers/immintrin.h b/clang/lib/Headers/ppc_wrappers/immintrin.h new file mode 100644 index 0000000000000..c1ada9889d4af --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/immintrin.h @@ -0,0 +1,27 @@ +/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef IMMINTRIN_H_ +#define IMMINTRIN_H_ + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#endif /* IMMINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/nmmintrin.h b/clang/lib/Headers/ppc_wrappers/nmmintrin.h new file mode 100644 index 0000000000000..789bba6bc0d33 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/nmmintrin.h @@ -0,0 +1,26 @@ +/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#endif + +#ifndef NMMINTRIN_H_ +#define NMMINTRIN_H_ + +/* We just include SSE4.1 header file. */ +#include + +#endif /* NMMINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/pmmintrin.h b/clang/lib/Headers/ppc_wrappers/pmmintrin.h index 8d4046bd43f1b..d8afdf6295507 100644 --- a/clang/lib/Headers/ppc_wrappers/pmmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/pmmintrin.h @@ -111,17 +111,21 @@ _mm_hsub_pd (__m128d __X, __m128d __Y) vec_mergel ((__v2df) __X, (__v2df)__Y)); } +#ifdef _ARCH_PWR8 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movehdup_ps (__m128 __X) { return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_moveldup_ps (__m128 __X) { return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X); } +#endif extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loaddup_pd (double const *__P) diff --git a/clang/lib/Headers/ppc_wrappers/smmintrin.h b/clang/lib/Headers/ppc_wrappers/smmintrin.h index 674703245a699..9a69114f902c9 100644 --- a/clang/lib/Headers/ppc_wrappers/smmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/smmintrin.h @@ -34,77 +34,683 @@ #include #include -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi8(__m128i __X, const int __N) { - return (unsigned char)((__v16qi)__X)[__N & 15]; +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_ZERO 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_NEG_INF 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_NINT \ + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR \ + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL \ + (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC \ + (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_pd (__m128d __A, int __rounding) +{ + __v2df __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_mffs (); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + } + + switch (__rounding) + { + case _MM_FROUND_TO_NEAREST_INT: + __fpscr_save.__fr = __builtin_mffsl (); + __attribute__ ((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_set_fpscr_rn (0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + + __r = vec_rint ((__v2df) __A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + __builtin_set_fpscr_rn (__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor ((__v2df) __A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil ((__v2df) __A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc ((__v2df) __A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint ((__v2df) __A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + /* Restore enabled exceptions. */ + __fpscr_save.__fr = __builtin_mffsl (); + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); + } + return (__m128d) __r; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi32(__m128i __X, const int __N) { +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_sd (__m128d __A, __m128d __B, int __rounding) +{ + __B = _mm_round_pd (__B, __rounding); + __v2df __r = { ((__v2df) __B)[0], ((__v2df) __A)[1] }; + return (__m128d) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ps (__m128 __A, int __rounding) +{ + __v4sf __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_mffs (); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + } + + switch (__rounding) + { + case _MM_FROUND_TO_NEAREST_INT: + __fpscr_save.__fr = __builtin_mffsl (); + __attribute__ ((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_set_fpscr_rn (0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + + __r = vec_rint ((__v4sf) __A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + __builtin_set_fpscr_rn (__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor ((__v4sf) __A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil ((__v4sf) __A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc ((__v4sf) __A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint ((__v4sf) __A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + /* Restore enabled exceptions. */ + __fpscr_save.__fr = __builtin_mffsl (); + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); + } + return (__m128) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ss (__m128 __A, __m128 __B, int __rounding) +{ + __B = _mm_round_ps (__B, __rounding); + __v4sf __r = (__v4sf) __A; + __r[0] = ((__v4sf) __B)[0]; + return (__m128) __r; +} + +#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR) + +#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i const __A, int const __D, int const __N) +{ + __v16qi result = (__v16qi)__A; + + result [__N & 0xf] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i const __A, int const __D, int const __N) +{ + __v4si result = (__v4si)__A; + + result [__N & 3] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N) +{ + __v2di result = (__v2di)__A; + + result [__N & 1] = __D; + + return (__m128i) result; +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi8 (__m128i __X, const int __N) +{ + return (unsigned char) ((__v16qi)__X)[__N & 15]; +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi32 (__m128i __X, const int __N) +{ return ((__v4si)__X)[__N & 3]; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi64(__m128i __X, const int __N) { +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi64 (__m128i __X, const int __N) +{ return ((__v2di)__X)[__N & 1]; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_ps(__m128 __X, const int __N) { +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_ps (__m128 __X, const int __N) +{ return ((__v4si)__X)[__N & 3]; } +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8) +{ + __v16qi __charmask = vec_splats ((signed char) __imm8); + __charmask = vec_gb (__charmask); + __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask); + #ifdef __BIG_ENDIAN__ + __shortmask = vec_reve (__shortmask); + #endif + return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask); +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask); +#else + const __v16qu __seven = vec_splats ((unsigned char) 0x07); + __v16qu __lmask = vec_sra ((__v16qu) __mask, __seven); + return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask); +#endif +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask); +#else + const __v4si __zero = {0}; + const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero); + return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask); +#endif +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 } + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128d) __r; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask); +#else + const __v2di __zero = {0}; + const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero); + return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask); +#endif +} +#endif + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A); + return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0; +} + +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) + +#define _mm_test_all_ones(V) \ + _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) + +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V)) + +#ifdef _ARCH_PWR8 extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) { - __v16qi __charmask = vec_splats((signed char)__imm8); - __charmask = vec_gb(__charmask); - __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask); -#ifdef __BIG_ENDIAN__ - __shortmask = vec_reve(__shortmask); +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y); +} #endif - return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask); + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) { - const __v16qu __seven = vec_splats((unsigned char)0x07); - __v16qu __lmask = vec_sra((__v16qu)__mask, __seven); - return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask); +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi8(__m128i const __A, int const __D, int const __N) { - __v16qi result = (__v16qi)__A; - result[__N & 0xf] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi32(__m128i const __A, int const __D, int const __N) { - __v4si result = (__v4si)__A; - result[__N & 3] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) { - __v2di result = (__v2di)__A; - result[__N & 1] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y); +} +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi16 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v16qi) __A); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi32 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v16qi) __A); + return (__m128i) vec_unpackh ((__v8hi) __A); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi64 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v16qi) __A); + __A = (__m128i) vec_unpackh ((__v8hi) __A); + return (__m128i) vec_unpackh ((__v4si) __A); +} +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi32 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v8hi) __A); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi64 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v8hi) __A); + return (__m128i) vec_unpackh ((__v4si) __A); +} +#endif + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_epi64 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v4si) __A); } +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi16 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi32 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); + __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi64 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero); + __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); + __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A); + __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi32 (__m128i __A) +{ + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi64 (__m128i __A) +{ + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A); + __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_epi64 (__m128i __A) +{ + const __v4su __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v4su) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_minpos_epu16 (__m128i __A) +{ + union __u + { + __m128i __m; + __v8hu __uh; + }; + union __u __u = { .__m = __A }, __r = { .__m = {0} }; + unsigned short __ridx = 0; + unsigned short __rmin = __u.__uh[__ridx]; + for (unsigned long __i = 1; __i < 8; __i++) + { + if (__u.__uh[__i] < __rmin) + { + __rmin = __u.__uh[__i]; + __ridx = __i; + } + } + __r.__uh[0] = __rmin; + __r.__uh[1] = __ridx; + return __r.__m; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y); +} +#endif #else #include_next #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \ */ -#endif /* _SMMINTRIN_H_ */ +#endif /* SMMINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/tmmintrin.h b/clang/lib/Headers/ppc_wrappers/tmmintrin.h index ebef7b8192d75..6e4677d8a6eff 100644 --- a/clang/lib/Headers/ppc_wrappers/tmmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/tmmintrin.h @@ -339,6 +339,7 @@ _mm_shuffle_pi8 (__m64 __A, __m64 __B) return (__m64) ((__v2du) (__C))[0]; } +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi8 (__m128i __A, __m128i __B) @@ -350,7 +351,9 @@ _mm_sign_epi8 (__m128i __A, __m128i __B) __v16qi __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi16 (__m128i __A, __m128i __B) @@ -362,7 +365,9 @@ _mm_sign_epi16 (__m128i __A, __m128i __B) __v8hi __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi32 (__m128i __A, __m128i __B) @@ -374,7 +379,9 @@ _mm_sign_epi32 (__m128i __A, __m128i __B) __v4si __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi8 (__m64 __A, __m64 __B) @@ -385,7 +392,9 @@ _mm_sign_pi8 (__m64 __A, __m64 __B) __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi16 (__m64 __A, __m64 __B) @@ -396,7 +405,9 @@ _mm_sign_pi16 (__m64 __A, __m64 __B) __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi32 (__m64 __A, __m64 __B) @@ -407,6 +418,7 @@ _mm_sign_pi32 (__m64 __A, __m64 __B) __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/clang/lib/Headers/ppc_wrappers/x86gprintrin.h b/clang/lib/Headers/ppc_wrappers/x86gprintrin.h new file mode 100644 index 0000000000000..cbfac262395c4 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/x86gprintrin.h @@ -0,0 +1,17 @@ +/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef X86GPRINTRIN_H_ +#define X86GPRINTRIN_H_ + +#include + +#include + +#endif /* X86GPRINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/x86intrin.h b/clang/lib/Headers/ppc_wrappers/x86intrin.h new file mode 100644 index 0000000000000..f5c201262e69e --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/x86intrin.h @@ -0,0 +1,28 @@ +/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." +#endif + +#ifndef X86INTRIN_H_ +#define X86INTRIN_H_ + +#ifdef __ALTIVEC__ +#include +#endif /* __ALTIVEC__ */ + +#endif /* X86INTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/xmmintrin.h b/clang/lib/Headers/ppc_wrappers/xmmintrin.h index 956603d364082..14b9cd1796d6e 100644 --- a/clang/lib/Headers/ppc_wrappers/xmmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/xmmintrin.h @@ -31,8 +31,8 @@ #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." #endif -#ifndef _XMMINTRIN_H_INCLUDED -#define _XMMINTRIN_H_INCLUDED +#ifndef XMMINTRIN_H_ +#define XMMINTRIN_H_ #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) @@ -881,7 +881,7 @@ _mm_cvtss_f32 (__m128 __A) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si32 (__m128 __A) { - __m64 res = 0; + int res; #ifdef _ARCH_PWR8 double dtmp; __asm__( @@ -914,8 +914,8 @@ _mm_cvt_ss2si (__m128 __A) extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si64 (__m128 __A) { - __m64 res = 0; -#ifdef _ARCH_PWR8 + long long res; +#if defined (_ARCH_PWR8) && defined (__powerpc64__) double dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ @@ -1328,6 +1328,9 @@ _mm_storel_pi (__m64 *__P, __m128 __A) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_ps (__m128 __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__vector unsigned int) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1347,6 +1350,7 @@ _mm_movemask_ps (__m128 __A) #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -1553,6 +1557,7 @@ _m_pminub (__m64 __A, __m64 __B) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pi8 (__m64 __A) { +#ifdef __powerpc64__ unsigned long long p = #ifdef __LITTLE_ENDIAN__ 0x0008101820283038UL; // permute control for sign bits @@ -1560,6 +1565,18 @@ _mm_movemask_pi8 (__m64 __A) 0x3830282018100800UL; // permute control for sign bits #endif return __builtin_bpermd (p, __A); +#else +#ifdef __LITTLE_ENDIAN__ + unsigned int mask = 0x20283038UL; + unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf; + unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf; +#else + unsigned int mask = 0x38302820UL; + unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf; + unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf; +#endif + return (r2 << 4) | r1; +#endif } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1841,4 +1858,4 @@ do { \ #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \ */ -#endif /* _XMMINTRIN_H_INCLUDED */ +#endif /* XMMINTRIN_H_ */ diff --git a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c index 6e4a2e4309d4a..33af7b91af788 100644 --- a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c @@ -5,6 +5,9 @@ // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ // RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10-LE + // CHECK-BE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> , align 16 // CHECK-BE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4 // CHECK-BE-DAG: @_mm_shufflehi_epi16.permute_selectors = internal constant [4 x i16] [i16 2057, i16 2571, i16 3085, i16 3599], align 2 @@ -440,7 +443,8 @@ test_converts() { // CHECK: call <2 x double> @vec_rint(double vector[2]) // CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16 // CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}}) -// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4]) +// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4]) +// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4]) // CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer) // CHECK-LABEL: define available_externally i64 @_mm_cvtpd_pi32 @@ -450,7 +454,8 @@ test_converts() { // CHECK-LABEL: define available_externally <4 x float> @_mm_cvtpd_ps // CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16 // CHECK: call <4 x i32> asm "xvcvdpsp ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}}) -// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4]) +// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4]) +// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4]) // CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer) // CHECK-LABEL: define available_externally <2 x double> @_mm_cvtpi32_pd @@ -530,7 +535,8 @@ test_converts() { // CHECK-LABEL: define available_externally <2 x i64> @_mm_cvttpd_epi32 // CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa" -// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4]) +// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4]) +// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4]) // CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer) // CHECK-LABEL: define available_externally i64 @_mm_cvttpd_pi32 @@ -756,12 +762,18 @@ test_move() { // CHECK: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0 // CHECK: insertelement <2 x double> %{{[0-9a-zA-Z_.]+}}, double %[[EXT]], i32 0 +// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_epi8 +// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}) + // CHECK-LABEL: define available_externally signext i32 @_mm_movemask_epi8 // CHECK: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef ) // CHECK-LE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1 // CHECK-BE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0 // CHECK: trunc i64 %[[VAL]] to i32 +// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_pd +// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) + // CHECK-LABEL: define available_externally signext i32 @_mm_movemask_pd // CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> to <16 x i8>)) // CHECK-LE: extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1 @@ -857,9 +869,9 @@ test_sad() { // CHECK: call <16 x i8> @vec_max(unsigned char vector[16], unsigned char vector[16]) // CHECK: call <16 x i8> @vec_sub(unsigned char vector[16], unsigned char vector[16]) // CHECK: call <4 x i32> @vec_sum4s(unsigned char vector[16], unsigned int vector[4])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer) -// CHECK: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer) -// CHECK-LE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 4) -// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 6) +// CHECK-LE: call <4 x i32> asm "vsum2sws $0,$1,$2", "=v,v,v"(<4 x i32> %11, <4 x i32> zeroinitializer) +// CHECK-BE: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer) +// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int) void __attribute__((noinline)) test_set() { @@ -1086,7 +1098,7 @@ test_sll() { // CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef zeroext 0) // CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, <2 x i64> noundef ) // CHECK: call <2 x i64> @vec_sl(unsigned long long vector[2], unsigned long long vector[2]) -// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2]) +// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2]) // CHECK-LABEL: define available_externally <2 x i64> @_mm_slli_epi16 // CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16 @@ -1232,7 +1244,7 @@ test_srl() { // CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 0) // CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef ) // CHECK: call <2 x i64> @vec_sr(unsigned long long vector[2], unsigned long long vector[2]) -// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2]) +// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2]) // CHECK-LABEL: define available_externally <2 x i64> @_mm_srli_epi16 // CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16 diff --git a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c index 5b2027dd52197..eaf96a71afca9 100644 --- a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c @@ -10,6 +10,11 @@ // RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ // RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10 +// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10 + #include __m128 mn1, mn2; @@ -48,6 +53,10 @@ void __attribute__((noinline)) test_blend() { _mm_blend_epi16(m1, m2, 0); _mm_blendv_epi8(m1, m2, mi); + _mm_blend_ps(mn1, mn2, 0); + _mm_blendv_ps(mn1, mn2, mn1); + _mm_blend_pd(md1, md2, 0); + _mm_blendv_pd(md1, md2, md1); } // CHECK-LABEL: @test_blend @@ -62,10 +71,13 @@ test_blend() { // BE: store <8 x i16> %[[REVE]], <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16 // CHECK: call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8]) +// P10-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// P10: call <16 x i8> @vec_blendv(signed char vector[16], signed char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}) + // CHECK-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) // CHECK: call <16 x i8> @vec_splats(unsigned char)(i8 noundef zeroext 7) // CHECK: call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16]) -// CHECK: call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], unsigned char vector[16]) +// CHECK: call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], unsigned char vector[16]) void __attribute__((noinline)) test_insert() { @@ -89,6 +101,239 @@ test_insert() { // CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1 // CHECK: insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[AND:[0-9a-zA-Z_.]+]] +void __attribute__((noinline)) +test_convert() { + _mm_cvtepi16_epi32(m1); + _mm_cvtepi16_epi64(m1); + _mm_cvtepi32_epi64(m1); + _mm_cvtepi8_epi16(m1); + _mm_cvtepi8_epi32(m1); + _mm_cvtepi8_epi64(m1); + _mm_cvtepu16_epi32(m1); + _mm_cvtepu16_epi64(m1); + _mm_cvtepu32_epi64(m1); + _mm_cvtepu8_epi16(m1); + _mm_cvtepu8_epi32(m1); + _mm_cvtepu8_epi64(m1); +} + +// CHECK-LABEL: @test_convert + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) +// CHECK: call <2 x i64> @vec_unpackh(int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x i64> @vec_unpackh(int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16]) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16]) +// CHECK: call <4 x i32> @vec_unpackh(short vector[8]) +// CHECK: call <2 x i64> @vec_unpackh(int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer) +// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer) +// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer) +// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}}) +// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer) +// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer) +// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer) +// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer) +// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}) +// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8]) +// CHECK: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4]) + +void __attribute__((noinline)) +test_minmax() { + _mm_max_epi32(m1, m2); + _mm_max_epi8(m1, m2); + _mm_max_epu16(m1, m2); + _mm_max_epu32(m1, m2); + _mm_min_epi32(m1, m2); + _mm_min_epi8(m1, m2); + _mm_min_epu16(m1, m2); + _mm_min_epu32(m1, m2); + _mm_minpos_epu16(m1); +} + +// CHECK-LABEL: @test_minmax + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_max(int vector[4], int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_max(signed char vector[16], signed char vector[16]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_max(unsigned short vector[8], unsigned short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_max(unsigned int vector[4], unsigned int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_min(int vector[4], int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_min(signed char vector[16], signed char vector[16]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_min(unsigned short vector[8], unsigned short vector[8]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x i32> @vec_min(unsigned int vector[4], unsigned int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_minpos_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 %{{[0-9a-zA-Z_.]+}}, i8 0, i64 16, i1 false) +// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i16 %{{[0-9a-zA-Z_.]+}} +// CHECK: %[[VEXT:[0-9a-zA-Z_.]+]] = extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: zext i16 %[[VEXT]] to i32 +// CHECK: zext i16 %{{[0-9a-zA-Z_.]+}} to i32 +// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}} +// CHECK: add i64 %{{[0-9a-zA-Z_.]+}}, 1 + +void __attribute__((noinline)) +test_round() { + _mm_round_ps(mn1, 0); + _mm_round_ss(mn1, mn2, 0); + _mm_round_pd(mn1, 0); + _mm_round_sd(mn1, mn2, 0); +} + +// CHECK-LABEL: @test_round + +// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0" +// CHECK: call <4 x float> @vec_rint(float vector[4]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x float> @vec_floor(float vector[4]) +// CHECK: call <4 x float> @vec_ceil(float vector[4]) +// CHECK: call <4 x float> @vec_trunc(float vector[4]) +// CHECK: call <4 x float> @vec_rint(float vector[4]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0 + +// CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0) +// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0" +// CHECK: call <2 x double> @vec_rint(double vector[2]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x double> @vec_floor(double vector[2]) +// CHECK: call <2 x double> @vec_ceil(double vector[2]) +// CHECK: call <2 x double> @vec_trunc(double vector[2]) +// CHECK: call <2 x double> @vec_rint(double vector[2]) +// CHECK: call void asm sideeffect "", "^wa" +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)() +// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) + +// CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) +// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0 +// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 1 + +void __attribute__((noinline)) +test_testing() { + _mm_testc_si128(m1, m2); + _mm_testnzc_si128(m1, m2); + _mm_testz_si128(m1, m2); +} + +// CHECK-LABEL: @test_testing + +// CHECK-LABEL: define available_externally signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_nor(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call1, <16 x i8> noundef zeroinitializer) + +// CHECK-LABEL: define available_externally signext i32 @_mm_testnzc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: zext i1 %{{[0-9a-zA-Z_.]+}} to i32 + +// CHECK-LABEL: define available_externally signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16]) +// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call, <16 x i8> noundef zeroinitializer) + +void __attribute__((noinline)) +test_compare() { + _mm_cmpeq_epi64(m1, m2); + _mm_cmpgt_epi64(m1, m2); +} + +// CHECK-LABEL: @test_compare + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpeq_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x i64> @vec_cmpeq(long long vector[2], long long vector[2]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpgt_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <2 x i64> @vec_cmpgt(long long vector[2], long long vector[2]) + +void __attribute__((noinline)) +test_mul() { + _mm_mul_epi32(m1, m2); + _mm_mullo_epi32(m1, m2); +} + +// CHECK-LABEL: @test_mul + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_mul_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %call = call <2 x i64> @vec_mule(int vector[4], int vector[4]) + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_mullo_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: %call = call <4 x i32> @vec_mul(unsigned int vector[4], unsigned int vector[4]) + +void __attribute__((noinline)) +test_packus() { + _mm_packus_epi32(m1, m2); +} + +// CHECK-LABEL: @test_packus + +// CHECK-LABEL: define available_externally <2 x i64> @_mm_packus_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call <8 x i16> @vec_packsu(int vector[4], int vector[4])(<4 x i32> noundef %1, <4 x i32> noundef %3) + // To test smmintrin.h includes tmmintrin.h void __attribute__((noinline)) diff --git a/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c new file mode 100644 index 0000000000000..fb5b9e82be45c --- /dev/null +++ b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c @@ -0,0 +1,239 @@ +// REQUIRES: powerpc-registered-target +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s +// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s + +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s +// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s + +#include + +unsigned short us; +unsigned ui; +unsigned long long ul; + +void __attribute__((noinline)) +test_bmiintrin() { + __tzcnt_u16(us); + __andn_u32(ui, ui); + _bextr_u32(ui, ui, ui); + __bextr_u32(ui, ui); + __blsi_u32(ui); + _blsi_u32(ui); + __blsmsk_u32(ui); + _blsmsk_u32(ui); + __blsr_u32(ui); + _blsr_u32(ui); + __tzcnt_u32(ui); + _tzcnt_u32(ui); + __andn_u64(ul, ul); + _bextr_u64(ul, ui, ui); + __bextr_u64(ul, ul); + __blsi_u64(ul); + _blsi_u64(ul); + __blsmsk_u64(ul); + _blsmsk_u64(ul); + __blsr_u64(ul); + _blsr_u64(ul); + __tzcnt_u64(ul); + _tzcnt_u64(ul); +} + +// CHECK-LABEL: @test_bmiintrin + +// CHECK-LABEL: define available_externally zeroext i16 @__tzcnt_u16(i16 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i16 %{{[0-9a-zA-Z._]+}} to i32 +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i32 @llvm.cttz.i32(i32 %[[CONV]], i1 false) +// CHECK: trunc i32 %[[CALL]] to i16 + +// CHECK-LABEL: define available_externally zeroext i32 @__andn_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i32 %{{[0-9a-zA-Z._]+}}, -1 +// CHECK: and i32 %[[NEG]], %1 + +// CHECK-LABEL: define available_externally zeroext i32 @_bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %[[ADD]] +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] +// CHECK: %[[SUB]]1 = sub i32 32, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i32 %[[SHL]], %[[SUB]]1 + +// CHECK-LABEL: define available_externally zeroext i32 @__bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i32 %{{[0-9a-zA-Z._]+}}, 255 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i32 %{{[0-9a-zA-Z._]+}}, 8 +// CHECK: and i32 %[[SHR]], 255 +// CHECK: call zeroext i32 @_bextr_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 0, %1 +// CHECK: and i32 %0, %[[SUB]] + +// CHECK-LABEL: define available_externally zeroext i32 @_blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call zeroext i32 @__blsi_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: xor i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] + +// CHECK-LABEL: define available_externally zeroext i32 @_blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call zeroext i32 @__blsmsk_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: and i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] + +// CHECK-LABEL: define available_externally zeroext i32 @_blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call zeroext i32 @__blsr_u32 + +// CHECK-LABEL: define available_externally zeroext i32 @__tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false) + +// CHECK-LABEL: define available_externally zeroext i32 @_tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false) + +// CHECK-LABEL: define available_externally i64 @__andn_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i64 %{{[0-9a-zA-Z._]+}}, -1 +// CHECK: and i64 %[[NEG]], %{{[0-9a-zA-Z._]+}} + +// CHECK-LABEL: define available_externally i64 @_bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 64, %[[ADD]] +// CHECK: %[[EXT:[0-9a-zA-Z._]+]] = zext i32 %[[SUB]] to i64 +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[EXT]] +// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 64, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[EXT2:[0-9a-zA-Z._]+]] = zext i32 %[[SUB1]] to i64 +// CHECK: lshr i64 %[[SHL]], %[[EXT2]] + +// CHECK-LABEL: define available_externally i64 @__bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 255 +// CHECK: trunc i64 %[[AND]] to i32 +// CHECK: %[[AND1:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 65280 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %[[AND1]], 8 +// CHECK: trunc i64 %[[SHR]] to i32 +// CHECK: call i64 @_bextr_u64 + +// CHECK-LABEL: define available_externally i64 @__blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 0, %{{[0-9a-zA-Z._]+}} +// CHECK: and i64 %0, %[[SUB]] + +// CHECK-LABEL: define available_externally i64 @_blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @__blsi_u64 + +// CHECK-LABEL: define available_externally i64 @__blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: xor i64 %0, %[[SUB]] + +// CHECK-LABEL: define available_externally i64 @_blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @__blsmsk_u64 + +// CHECK-LABEL: define available_externally i64 @__blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: and i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]] + +// CHECK-LABEL: define available_externally i64 @_blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @__blsr_u64 + +// CHECK-LABEL: define available_externally i64 @__tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32 +// CHECK: sext i32 %[[CAST]] to i64 + +// CHECK-LABEL: define available_externally i64 @_tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32 +// CHECK: sext i32 %[[CAST]] to i64 + +void __attribute__((noinline)) +test_bmi2intrin() { + _bzhi_u32(ui, ui); + _mulx_u32(ui, ui, &ui); + _bzhi_u64(ul, ul); + _mulx_u64(ul, ul, &ul); + _pdep_u64(ul, ul); + _pext_u64(ul, ul); + _pdep_u32(ui, ui); + _pext_u32(ui, ui); +} + +// CHECK-LABEL: @test_bmi2intrin + +// CHECK-LABEL: define available_externally zeroext i32 @_bzhi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]] +// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i32 %[[SHL]], %[[SUB1]] + +// CHECK-LABEL: define available_externally zeroext i32 @_mulx_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32* noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %{{[0-9a-zA-Z._]+}}, 32 +// CHECK: trunc i64 %[[SHR]] to i32 +// CHECK: trunc i64 %{{[0-9a-zA-Z._]+}} to i32 + +// CHECK-LABEL: define available_externally i64 @_bzhi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]] +// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i64 %[[SHL]], %[[SUB1]] + +// CHECK-LABEL: define available_externally i64 @_mulx_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}, i64* noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128 +// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128 +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i128 %{{[0-9a-zA-Z._]+}}, 64 +// CHECK: trunc i128 %[[SHR]] to i64 +// CHECK: trunc i128 %{{[0-9a-zA-Z._]+}} to i64 + +// CHECK-LABEL: define available_externally i64 @_pdep_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32 +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub nsw i32 64, %[[CAST]] +// CHECK: sext i32 %[[SUB]] to i64 +// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[CAST2:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL2]] to i32 +// CHECK: sext i32 %[[CAST2]] to i64 +// CHECK: %[[SUB2:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB2]] +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]] +// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]] +// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %[[AND]] +// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1 + +// CHECK-LABEL: define available_externally i64 @_pext_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i1 @llvm.is.constant.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: br i1 %[[CALL]], label %[[TRUECOND:[0-9a-zA-Z._]+]], label %[[FALSECOND:[0-9a-zA-Z._]+]] +// CHECK: [[TRUECOND]]: +// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, 8 +// CHECK: or i64 %[[SHL]], %{{[0-9a-zA-Z._]+}} +// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]] +// CHECK: add nsw i64 %{{[0-9a-zA-Z._]+}}, 1 +// CHECK: call i64 @llvm.ppc.bpermd +// CHECK: [[FALSECOND]]: +// CHECK: call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}}) +// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false) +// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]] +// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: lshr i64 %[[AND]], %[[SUB]] +// CHECK: %[[SHR3:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}} +// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR3]] +// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}} +// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1 + +// CHECK-LABEL: define available_externally zeroext i32 @_pdep_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pdep_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]]) +// CHECK: trunc i64 %[[CALL]] to i32 + +// CHECK-LABEL: define available_externally zeroext i32 @_pext_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}) +// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64 +// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pext_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]]) +// CHECK: trunc i64 %[[CALL]] to i32 diff --git a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c index 1a331087efd9e..163bd7716ff04 100644 --- a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c @@ -9,6 +9,9 @@ // RUN: %clang -x c++ -fsyntax-only -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ // RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns +// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-P10 + // RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \ // RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE // RUN: %clang -x c++ -fsyntax-only -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \ @@ -383,12 +386,12 @@ test_convert() { // CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0 // CHECK-LABEL: define available_externally signext i32 @_mm_cvtss_si32 -// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0" -// CHECK-BE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0" -// CHECK: extractvalue { <4 x float>, i64, double } %[[VEC]], 0 -// CHECK: extractvalue { <4 x float>, i64, double } %[[VEC]], 1 -// CHECK: extractvalue { <4 x float>, i64, double } %[[VEC]], 2 -// CHECK: trunc i64 %{{[0-9a-zA-Z_.]+}} to i32 +// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0" +// CHECK-BE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0" +// CHECK-P10: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0" +// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 0 +// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 1 +// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 2 // CHECK-LABEL: define available_externally i64 @_mm_cvtss_si64 // CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctid $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0" @@ -681,9 +684,11 @@ test_move() { // CHECK-LABEL: define available_externally signext i32 @_mm_movemask_ps // CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> to <16 x i8>)) // CHECK-LE: extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1 +// CHECK-LE: trunc i64 %[[EXT]] to i32 // CHECK-BE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> to <16 x i8>)) // CHECK-BE: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0 -// CHECK: trunc i64 %[[EXT]] to i32 +// CHECK-BE: trunc i64 %[[EXT]] to i32 +// CHECK-P10: call zeroext i32 @vec_extractm(unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}) void __attribute__((noinline)) test_alt_name_move() {