Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 0 additions & 26 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
}

Expand All @@ -222,8 +220,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
Expand Down Expand Up @@ -501,8 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
Expand Down Expand Up @@ -3603,14 +3597,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
}

let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
}

let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
}

let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
}
Expand Down Expand Up @@ -5129,15 +5115,3 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
}

let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
}

let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
}

let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
}
64 changes: 19 additions & 45 deletions clang/lib/CodeGen/TargetBuiltins/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2183,21 +2183,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return Builder.CreateBitCast(Res, Ops[0]->getType());
}

case X86::BI__builtin_ia32_sqrtss:
case X86::BI__builtin_ia32_sqrtsd: {
Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
Function *F;
if (Builder.getIsFPConstrained()) {
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
A->getType());
A = Builder.CreateConstrainedFPCall(F, {A});
} else {
F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
A = Builder.CreateCall(F, {A});
}
return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
}
case X86::BI__builtin_ia32_sqrtsh_round_mask:
case X86::BI__builtin_ia32_sqrtsd_round_mask:
case X86::BI__builtin_ia32_sqrtss_round_mask: {
Expand Down Expand Up @@ -2237,40 +2222,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
}
case X86::BI__builtin_ia32_sqrtpd256:
case X86::BI__builtin_ia32_sqrtpd:
case X86::BI__builtin_ia32_sqrtps256:
case X86::BI__builtin_ia32_sqrtps:
case X86::BI__builtin_ia32_sqrtph256:
case X86::BI__builtin_ia32_sqrtph:
case X86::BI__builtin_ia32_sqrtph512:
case X86::BI__builtin_ia32_vsqrtbf16256:
case X86::BI__builtin_ia32_vsqrtbf16:
case X86::BI__builtin_ia32_vsqrtbf16512:
case X86::BI__builtin_ia32_sqrtps512:
case X86::BI__builtin_ia32_sqrtpd512: {
if (Ops.size() == 2) {
unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
// Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
// otherwise keep the intrinsic.
if (CC != 4) {
Intrinsic::ID IID;

switch (BuiltinID) {
default:
llvm_unreachable("Unsupported intrinsic!");
case X86::BI__builtin_ia32_sqrtph512:
IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
break;
case X86::BI__builtin_ia32_sqrtps512:
IID = Intrinsic::x86_avx512_sqrt_ps_512;
break;
case X86::BI__builtin_ia32_sqrtpd512:
IID = Intrinsic::x86_avx512_sqrt_pd_512;
break;
}
return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
// Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
// otherwise keep the intrinsic.
if (CC != 4) {
Intrinsic::ID IID;

switch (BuiltinID) {
default:
llvm_unreachable("Unsupported intrinsic!");
case X86::BI__builtin_ia32_sqrtph512:
IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
break;
case X86::BI__builtin_ia32_sqrtps512:
IID = Intrinsic::x86_avx512_sqrt_ps_512;
break;
case X86::BI__builtin_ia32_sqrtpd512:
IID = Intrinsic::x86_avx512_sqrt_pd_512;
break;
}
return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
if (Builder.getIsFPConstrained()) {
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/Headers/avx10_2_512bf16intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
(__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))

static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
return __builtin_elementwise_sqrt(__A);
}

static __inline__ __m512bh __DEFAULT_FN_ATTRS512
Expand Down
4 changes: 2 additions & 2 deletions clang/lib/Headers/avx10_2bf16intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -818,7 +818,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
(__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))

static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
return __builtin_elementwise_sqrt(__A);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
Expand All @@ -835,7 +835,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
return __builtin_elementwise_sqrt(__A);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
Expand Down
4 changes: 2 additions & 2 deletions clang/lib/Headers/avx512vlfp16intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
(__mmask16)(U)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
return __builtin_ia32_sqrtph((__v8hf)__a);
return __builtin_elementwise_sqrt(__a);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
Expand All @@ -628,7 +628,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
}

static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
return __builtin_elementwise_sqrt(__a);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
Expand Down
12 changes: 4 additions & 8 deletions clang/lib/Headers/avxintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -344,10 +344,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
/// A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the square roots of the
/// values in the operand.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_sqrt_pd(__m256d __a)
{
return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
return __builtin_elementwise_sqrt(__a);
}

/// Calculates the square roots of the values in a 256-bit vector of
Expand All @@ -361,10 +359,8 @@ _mm256_sqrt_pd(__m256d __a)
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the square roots of the
/// values in the operand.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_sqrt_ps(__m256 __a)
{
return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
return __builtin_elementwise_sqrt(__a);
}

/// Calculates the reciprocal square roots of the values in a 256-bit
Expand Down
5 changes: 2 additions & 3 deletions clang/lib/Headers/emmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,8 +248,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
/// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
__m128d __b) {
__m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
return __extension__(__m128d){__c[0], __a[1]};
return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not correct. We need to consider the constrained FP case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you talking about the Builder.getIsFPConstrained() branch? AFAICT that's handled by __builtin_elementwise_sqrt, since that uses emitUnaryMaybeConstrainedFPBuiltin.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have some sse/avx constrained tests - but I'm not certain all these sqrt intrinsics are covered

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good to know, thanks!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@RKSimon Are you requesting additional test coverage?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes please, we need at least sse2-builtins-constrained.c, avx-builtins-constrained.c and avx512vlfp16-builtins-constrained.c test files similar to sse-builtins-constrained.c - unless you can find other test files that already check these intrinsic

}

/// Calculates the square root of the each of two values stored in a
Expand All @@ -264,7 +263,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
/// \returns A 128-bit vector of [2 x double] containing the square roots of the
/// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
return __builtin_ia32_sqrtpd((__v2df)__a);
return __builtin_elementwise_sqrt(__a);
}

/// Compares lower 64-bit double-precision values of both operands, and
Expand Down
13 changes: 5 additions & 8 deletions clang/lib/Headers/xmmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
/// used in the calculation.
/// \returns A 128-bit vector of [4 x float] containing the square root of the
/// value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ss(__m128 __a)
{
return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
__a[0] = __builtin_elementwise_sqrt(__a[0]);
return __a;
}

/// Calculates the square roots of the values stored in a 128-bit vector
Expand All @@ -256,10 +255,8 @@ _mm_sqrt_ss(__m128 __a)
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] containing the square roots of the
/// values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ps(__m128 __a)
{
return __builtin_ia32_sqrtps((__v4sf)__a);
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
return __builtin_elementwise_sqrt(__a);
}

/// Calculates the approximate reciprocal of the value stored in the
Expand Down