-
Notifications
You must be signed in to change notification settings - Fork 15k
[Clang] Replace some x86 sqrt builtins with the generic __builtin_elementwise_sqrt versions #165682
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-x86 Author: Nikolas Klauser (philnik777) ChangesFull diff: https://github.com/llvm/llvm-project/pull/165682.diff 8 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index acd8f70c4a5f2..23808056296bb 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -199,8 +199,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
- def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
- def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
}
@@ -222,8 +220,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
- def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
- def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
@@ -501,8 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
- def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
- def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
@@ -3603,14 +3597,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
}
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
-}
-
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
-}
-
let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
}
@@ -5129,15 +5115,3 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
-}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
-}
-
-let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
-}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index a4974e45caf10..f96c0bab0fd34 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2183,21 +2183,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateBitCast(Res, Ops[0]->getType());
}
- case X86::BI__builtin_ia32_sqrtss:
- case X86::BI__builtin_ia32_sqrtsd: {
- Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
- Function *F;
- if (Builder.getIsFPConstrained()) {
- CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
- F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
- A->getType());
- A = Builder.CreateConstrainedFPCall(F, {A});
- } else {
- F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
- A = Builder.CreateCall(F, {A});
- }
- return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
- }
case X86::BI__builtin_ia32_sqrtsh_round_mask:
case X86::BI__builtin_ia32_sqrtsd_round_mask:
case X86::BI__builtin_ia32_sqrtss_round_mask: {
@@ -2237,40 +2222,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
}
- case X86::BI__builtin_ia32_sqrtpd256:
- case X86::BI__builtin_ia32_sqrtpd:
- case X86::BI__builtin_ia32_sqrtps256:
- case X86::BI__builtin_ia32_sqrtps:
- case X86::BI__builtin_ia32_sqrtph256:
- case X86::BI__builtin_ia32_sqrtph:
case X86::BI__builtin_ia32_sqrtph512:
- case X86::BI__builtin_ia32_vsqrtbf16256:
- case X86::BI__builtin_ia32_vsqrtbf16:
- case X86::BI__builtin_ia32_vsqrtbf16512:
case X86::BI__builtin_ia32_sqrtps512:
case X86::BI__builtin_ia32_sqrtpd512: {
- if (Ops.size() == 2) {
- unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
- // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
- // otherwise keep the intrinsic.
- if (CC != 4) {
- Intrinsic::ID IID;
-
- switch (BuiltinID) {
- default:
- llvm_unreachable("Unsupported intrinsic!");
- case X86::BI__builtin_ia32_sqrtph512:
- IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
- break;
- case X86::BI__builtin_ia32_sqrtps512:
- IID = Intrinsic::x86_avx512_sqrt_ps_512;
- break;
- case X86::BI__builtin_ia32_sqrtpd512:
- IID = Intrinsic::x86_avx512_sqrt_pd_512;
- break;
- }
- return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+ unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+ // otherwise keep the intrinsic.
+ if (CC != 4) {
+ Intrinsic::ID IID;
+
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_sqrtph512:
+ IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtps512:
+ IID = Intrinsic::x86_avx512_sqrt_ps_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtpd512:
+ IID = Intrinsic::x86_avx512_sqrt_pd_512;
+ break;
}
+ return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
if (Builder.getIsFPConstrained()) {
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 95e9bd7a36f9b..84075a6cabeeb 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -423,7 +423,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
(__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
- return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 0c7f381f04fa5..ca1eca8fb5db6 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -818,7 +818,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
(__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
- return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -835,7 +835,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
}
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
- return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 98ad9b54eef39..99bb6c52208bd 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -611,7 +611,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
(__mmask16)(U)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
- return __builtin_ia32_sqrtph((__v8hf)__a);
+ return __builtin_elementwise_sqrt(__a);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
@@ -628,7 +628,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
}
static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
- return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+ return __builtin_elementwise_sqrt(__a);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index b8cfaee7cfb46..435157e00fb3b 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -344,10 +344,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
/// A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the square roots of the
/// values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
- return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the square roots of the values in a 256-bit vector of
@@ -361,10 +359,8 @@ _mm256_sqrt_pd(__m256d __a)
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the square roots of the
/// values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
- return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the reciprocal square roots of the values in a 256-bit
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e15a260514f2d..dd2e57d9b91b9 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -248,8 +248,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
/// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
__m128d __b) {
- __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
- return __extension__(__m128d){__c[0], __a[1]};
+ return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
}
/// Calculates the square root of the each of two values stored in a
@@ -264,7 +263,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] containing the square roots of the
/// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
- return __builtin_ia32_sqrtpd((__v2df)__a);
+ return __builtin_elementwise_sqrt(__a);
}
/// Compares lower 64-bit double-precision values of both operands, and
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 6d44cff46661f..cb1665b6b1b71 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -239,10 +239,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
/// used in the calculation.
/// \returns A 128-bit vector of [4 x float] containing the square root of the
/// value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
- return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
+ __a[0] = __builtin_elementwise_sqrt(__a[0]);
+ return __a;
}
/// Calculates the square roots of the values stored in a 128-bit vector
@@ -256,10 +255,8 @@ _mm_sqrt_ss(__m128 __a)
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the square roots of the
/// values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
- return __builtin_ia32_sqrtps((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the approximate reciprocal of the value stored in the
|
|
@llvm/pr-subscribers-clang-codegen Author: Nikolas Klauser (philnik777) ChangesFull diff: https://github.com/llvm/llvm-project/pull/165682.diff 8 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index acd8f70c4a5f2..23808056296bb 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -199,8 +199,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
- def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
- def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
}
@@ -222,8 +220,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
- def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
- def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
@@ -501,8 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
- def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
- def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
@@ -3603,14 +3597,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
}
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
-}
-
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
-}
-
let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
}
@@ -5129,15 +5115,3 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
-}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
-}
-
-let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
-}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index a4974e45caf10..f96c0bab0fd34 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2183,21 +2183,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateBitCast(Res, Ops[0]->getType());
}
- case X86::BI__builtin_ia32_sqrtss:
- case X86::BI__builtin_ia32_sqrtsd: {
- Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
- Function *F;
- if (Builder.getIsFPConstrained()) {
- CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
- F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
- A->getType());
- A = Builder.CreateConstrainedFPCall(F, {A});
- } else {
- F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
- A = Builder.CreateCall(F, {A});
- }
- return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
- }
case X86::BI__builtin_ia32_sqrtsh_round_mask:
case X86::BI__builtin_ia32_sqrtsd_round_mask:
case X86::BI__builtin_ia32_sqrtss_round_mask: {
@@ -2237,40 +2222,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
}
- case X86::BI__builtin_ia32_sqrtpd256:
- case X86::BI__builtin_ia32_sqrtpd:
- case X86::BI__builtin_ia32_sqrtps256:
- case X86::BI__builtin_ia32_sqrtps:
- case X86::BI__builtin_ia32_sqrtph256:
- case X86::BI__builtin_ia32_sqrtph:
case X86::BI__builtin_ia32_sqrtph512:
- case X86::BI__builtin_ia32_vsqrtbf16256:
- case X86::BI__builtin_ia32_vsqrtbf16:
- case X86::BI__builtin_ia32_vsqrtbf16512:
case X86::BI__builtin_ia32_sqrtps512:
case X86::BI__builtin_ia32_sqrtpd512: {
- if (Ops.size() == 2) {
- unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
- // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
- // otherwise keep the intrinsic.
- if (CC != 4) {
- Intrinsic::ID IID;
-
- switch (BuiltinID) {
- default:
- llvm_unreachable("Unsupported intrinsic!");
- case X86::BI__builtin_ia32_sqrtph512:
- IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
- break;
- case X86::BI__builtin_ia32_sqrtps512:
- IID = Intrinsic::x86_avx512_sqrt_ps_512;
- break;
- case X86::BI__builtin_ia32_sqrtpd512:
- IID = Intrinsic::x86_avx512_sqrt_pd_512;
- break;
- }
- return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+ unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+ // otherwise keep the intrinsic.
+ if (CC != 4) {
+ Intrinsic::ID IID;
+
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_sqrtph512:
+ IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtps512:
+ IID = Intrinsic::x86_avx512_sqrt_ps_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtpd512:
+ IID = Intrinsic::x86_avx512_sqrt_pd_512;
+ break;
}
+ return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
if (Builder.getIsFPConstrained()) {
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 95e9bd7a36f9b..84075a6cabeeb 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -423,7 +423,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
(__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
- return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 0c7f381f04fa5..ca1eca8fb5db6 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -818,7 +818,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
(__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
- return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -835,7 +835,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
}
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
- return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 98ad9b54eef39..99bb6c52208bd 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -611,7 +611,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
(__mmask16)(U)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
- return __builtin_ia32_sqrtph((__v8hf)__a);
+ return __builtin_elementwise_sqrt(__a);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
@@ -628,7 +628,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
}
static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
- return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+ return __builtin_elementwise_sqrt(__a);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index b8cfaee7cfb46..435157e00fb3b 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -344,10 +344,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
/// A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the square roots of the
/// values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
- return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the square roots of the values in a 256-bit vector of
@@ -361,10 +359,8 @@ _mm256_sqrt_pd(__m256d __a)
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the square roots of the
/// values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
- return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the reciprocal square roots of the values in a 256-bit
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e15a260514f2d..dd2e57d9b91b9 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -248,8 +248,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
/// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
__m128d __b) {
- __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
- return __extension__(__m128d){__c[0], __a[1]};
+ return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
}
/// Calculates the square root of the each of two values stored in a
@@ -264,7 +263,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] containing the square roots of the
/// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
- return __builtin_ia32_sqrtpd((__v2df)__a);
+ return __builtin_elementwise_sqrt(__a);
}
/// Compares lower 64-bit double-precision values of both operands, and
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 6d44cff46661f..cb1665b6b1b71 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -239,10 +239,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
/// used in the calculation.
/// \returns A 128-bit vector of [4 x float] containing the square root of the
/// value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
- return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
+ __a[0] = __builtin_elementwise_sqrt(__a[0]);
+ return __a;
}
/// Calculates the square roots of the values stored in a 128-bit vector
@@ -256,10 +255,8 @@ _mm_sqrt_ss(__m128 __a)
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the square roots of the
/// values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
- return __builtin_ia32_sqrtps((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the approximate reciprocal of the value stored in the
|
|
@llvm/pr-subscribers-clang Author: Nikolas Klauser (philnik777) ChangesFull diff: https://github.com/llvm/llvm-project/pull/165682.diff 8 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index acd8f70c4a5f2..23808056296bb 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -199,8 +199,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
- def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
- def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
}
@@ -222,8 +220,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
- def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
- def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
@@ -501,8 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
- def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
- def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
@@ -3603,14 +3597,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
}
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
-}
-
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
-}
-
let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
}
@@ -5129,15 +5115,3 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
- def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
-}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
-}
-
-let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
-}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index a4974e45caf10..f96c0bab0fd34 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2183,21 +2183,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateBitCast(Res, Ops[0]->getType());
}
- case X86::BI__builtin_ia32_sqrtss:
- case X86::BI__builtin_ia32_sqrtsd: {
- Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
- Function *F;
- if (Builder.getIsFPConstrained()) {
- CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
- F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
- A->getType());
- A = Builder.CreateConstrainedFPCall(F, {A});
- } else {
- F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
- A = Builder.CreateCall(F, {A});
- }
- return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
- }
case X86::BI__builtin_ia32_sqrtsh_round_mask:
case X86::BI__builtin_ia32_sqrtsd_round_mask:
case X86::BI__builtin_ia32_sqrtss_round_mask: {
@@ -2237,40 +2222,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
}
- case X86::BI__builtin_ia32_sqrtpd256:
- case X86::BI__builtin_ia32_sqrtpd:
- case X86::BI__builtin_ia32_sqrtps256:
- case X86::BI__builtin_ia32_sqrtps:
- case X86::BI__builtin_ia32_sqrtph256:
- case X86::BI__builtin_ia32_sqrtph:
case X86::BI__builtin_ia32_sqrtph512:
- case X86::BI__builtin_ia32_vsqrtbf16256:
- case X86::BI__builtin_ia32_vsqrtbf16:
- case X86::BI__builtin_ia32_vsqrtbf16512:
case X86::BI__builtin_ia32_sqrtps512:
case X86::BI__builtin_ia32_sqrtpd512: {
- if (Ops.size() == 2) {
- unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
- // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
- // otherwise keep the intrinsic.
- if (CC != 4) {
- Intrinsic::ID IID;
-
- switch (BuiltinID) {
- default:
- llvm_unreachable("Unsupported intrinsic!");
- case X86::BI__builtin_ia32_sqrtph512:
- IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
- break;
- case X86::BI__builtin_ia32_sqrtps512:
- IID = Intrinsic::x86_avx512_sqrt_ps_512;
- break;
- case X86::BI__builtin_ia32_sqrtpd512:
- IID = Intrinsic::x86_avx512_sqrt_pd_512;
- break;
- }
- return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+ unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+ // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+ // otherwise keep the intrinsic.
+ if (CC != 4) {
+ Intrinsic::ID IID;
+
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unsupported intrinsic!");
+ case X86::BI__builtin_ia32_sqrtph512:
+ IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtps512:
+ IID = Intrinsic::x86_avx512_sqrt_ps_512;
+ break;
+ case X86::BI__builtin_ia32_sqrtpd512:
+ IID = Intrinsic::x86_avx512_sqrt_pd_512;
+ break;
}
+ return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
if (Builder.getIsFPConstrained()) {
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 95e9bd7a36f9b..84075a6cabeeb 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -423,7 +423,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
(__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
- return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 0c7f381f04fa5..ca1eca8fb5db6 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -818,7 +818,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
(__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
- return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -835,7 +835,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
}
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
- return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
+ return __builtin_elementwise_sqrt(__A);
}
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 98ad9b54eef39..99bb6c52208bd 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -611,7 +611,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
(__mmask16)(U)))
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
- return __builtin_ia32_sqrtph((__v8hf)__a);
+ return __builtin_elementwise_sqrt(__a);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
@@ -628,7 +628,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
}
static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
- return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+ return __builtin_elementwise_sqrt(__a);
}
static __inline__ __m256h __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index b8cfaee7cfb46..435157e00fb3b 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -344,10 +344,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
/// A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the square roots of the
/// values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
- return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the square roots of the values in a 256-bit vector of
@@ -361,10 +359,8 @@ _mm256_sqrt_pd(__m256d __a)
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the square roots of the
/// values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
- return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the reciprocal square roots of the values in a 256-bit
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e15a260514f2d..dd2e57d9b91b9 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -248,8 +248,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
/// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
__m128d __b) {
- __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
- return __extension__(__m128d){__c[0], __a[1]};
+ return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
}
/// Calculates the square root of the each of two values stored in a
@@ -264,7 +263,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] containing the square roots of the
/// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
- return __builtin_ia32_sqrtpd((__v2df)__a);
+ return __builtin_elementwise_sqrt(__a);
}
/// Compares lower 64-bit double-precision values of both operands, and
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 6d44cff46661f..cb1665b6b1b71 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -239,10 +239,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
/// used in the calculation.
/// \returns A 128-bit vector of [4 x float] containing the square root of the
/// value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
- return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
+ __a[0] = __builtin_elementwise_sqrt(__a[0]);
+ return __a;
}
/// Calculates the square roots of the values stored in a 128-bit vector
@@ -256,10 +255,8 @@ _mm_sqrt_ss(__m128 __a)
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the square roots of the
/// values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
- return __builtin_ia32_sqrtps((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
+ return __builtin_elementwise_sqrt(__a);
}
/// Calculates the approximate reciprocal of the value stored in the
|
| @@ -241,8 +248,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a, | |||
| /// bits are copied from the upper 64 bits of operand \a __a. | |||
| static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, | |||
| __m128d __b) { | |||
| __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); | |||
| return __extension__(__m128d){__c[0], __a[1]}; | |||
| return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]}; | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not correct. We need to consider the constrained FP case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you talking about the Builder.getIsFPConstrained() branch? AFAICT that's handled by __builtin_elementwise_sqrt, since that uses emitUnaryMaybeConstrainedFPBuiltin.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do have some sse/avx constrained tests - but I'm not certain all these sqrt intrinsics are covered
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good to know, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
No description provided.