Skip to content

Commit 0dbedd1

Browse files
authored
[Clang] Replace some x86 sqrt builtins with the generic __builtin_elementwise_sqrt versions (#165682)
1 parent 8401a8d commit 0dbedd1

File tree

13 files changed

+44
-111
lines changed

13 files changed

+44
-111
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
156156
def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
157157
def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
158158
def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
159-
def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
160-
def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
161159
}
162160

163161
let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
@@ -170,8 +168,6 @@ let Features = "sse2", Attributes = [NoThrow] in {
170168

171169
let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
172170
def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
173-
def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
174-
def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
175171
def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
176172
def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
177173
def cvttpd2dq : X86Builtin<"_Vector<4, int>(_Vector<2, double>)">;
@@ -513,8 +509,6 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
513509
}
514510

515511
let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
516-
def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
517-
def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
518512
def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
519513
def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
520514
def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
@@ -3539,14 +3533,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
35393533
def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
35403534
}
35413535

3542-
let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
3543-
def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
3544-
}
3545-
3546-
let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
3547-
def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
3548-
}
3549-
35503536
let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
35513537
def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
35523538
}
@@ -5065,15 +5051,3 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>
50655051
let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
50665052
def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
50675053
}
5068-
5069-
let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
5070-
def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
5071-
}
5072-
5073-
let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
5074-
def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
5075-
}
5076-
5077-
let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
5078-
def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
5079-
}

clang/lib/CodeGen/TargetBuiltins/X86.cpp

Lines changed: 19 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2171,21 +2171,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
21712171
return Builder.CreateBitCast(Res, Ops[0]->getType());
21722172
}
21732173

2174-
case X86::BI__builtin_ia32_sqrtss:
2175-
case X86::BI__builtin_ia32_sqrtsd: {
2176-
Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
2177-
Function *F;
2178-
if (Builder.getIsFPConstrained()) {
2179-
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2180-
F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
2181-
A->getType());
2182-
A = Builder.CreateConstrainedFPCall(F, {A});
2183-
} else {
2184-
F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
2185-
A = Builder.CreateCall(F, {A});
2186-
}
2187-
return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
2188-
}
21892174
case X86::BI__builtin_ia32_sqrtsh_round_mask:
21902175
case X86::BI__builtin_ia32_sqrtsd_round_mask:
21912176
case X86::BI__builtin_ia32_sqrtss_round_mask: {
@@ -2225,40 +2210,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
22252210
A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
22262211
return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
22272212
}
2228-
case X86::BI__builtin_ia32_sqrtpd256:
2229-
case X86::BI__builtin_ia32_sqrtpd:
2230-
case X86::BI__builtin_ia32_sqrtps256:
2231-
case X86::BI__builtin_ia32_sqrtps:
2232-
case X86::BI__builtin_ia32_sqrtph256:
2233-
case X86::BI__builtin_ia32_sqrtph:
22342213
case X86::BI__builtin_ia32_sqrtph512:
2235-
case X86::BI__builtin_ia32_vsqrtbf16256:
2236-
case X86::BI__builtin_ia32_vsqrtbf16:
2237-
case X86::BI__builtin_ia32_vsqrtbf16512:
22382214
case X86::BI__builtin_ia32_sqrtps512:
22392215
case X86::BI__builtin_ia32_sqrtpd512: {
2240-
if (Ops.size() == 2) {
2241-
unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
2242-
// Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
2243-
// otherwise keep the intrinsic.
2244-
if (CC != 4) {
2245-
Intrinsic::ID IID;
2246-
2247-
switch (BuiltinID) {
2248-
default:
2249-
llvm_unreachable("Unsupported intrinsic!");
2250-
case X86::BI__builtin_ia32_sqrtph512:
2251-
IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
2252-
break;
2253-
case X86::BI__builtin_ia32_sqrtps512:
2254-
IID = Intrinsic::x86_avx512_sqrt_ps_512;
2255-
break;
2256-
case X86::BI__builtin_ia32_sqrtpd512:
2257-
IID = Intrinsic::x86_avx512_sqrt_pd_512;
2258-
break;
2259-
}
2260-
return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
2216+
unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
2217+
// Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
2218+
// otherwise keep the intrinsic.
2219+
if (CC != 4) {
2220+
Intrinsic::ID IID;
2221+
2222+
switch (BuiltinID) {
2223+
default:
2224+
llvm_unreachable("Unsupported intrinsic!");
2225+
case X86::BI__builtin_ia32_sqrtph512:
2226+
IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
2227+
break;
2228+
case X86::BI__builtin_ia32_sqrtps512:
2229+
IID = Intrinsic::x86_avx512_sqrt_ps_512;
2230+
break;
2231+
case X86::BI__builtin_ia32_sqrtpd512:
2232+
IID = Intrinsic::x86_avx512_sqrt_pd_512;
2233+
break;
22612234
}
2235+
return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
22622236
}
22632237
if (Builder.getIsFPConstrained()) {
22642238
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);

clang/lib/Headers/avx10_2_512bf16intrin.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
429429
(__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
430430

431431
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
432-
return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
432+
return __builtin_elementwise_sqrt(__A);
433433
}
434434

435435
static __inline__ __m512bh __DEFAULT_FN_ATTRS512

clang/lib/Headers/avx10_2bf16intrin.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -826,7 +826,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
826826
(__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
827827

828828
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
829-
return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
829+
return __builtin_elementwise_sqrt(__A);
830830
}
831831

832832
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -843,7 +843,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
843843
}
844844

845845
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
846-
return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
846+
return __builtin_elementwise_sqrt(__A);
847847
}
848848

849849
static __inline__ __m128bh __DEFAULT_FN_ATTRS128

clang/lib/Headers/avx512vlfp16intrin.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
623623
(__mmask16)(U)))
624624

625625
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
626-
return __builtin_ia32_sqrtph((__v8hf)__a);
626+
return __builtin_elementwise_sqrt(__a);
627627
}
628628

629629
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
@@ -640,7 +640,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
640640
}
641641

642642
static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
643-
return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
643+
return __builtin_elementwise_sqrt(__a);
644644
}
645645

646646
static __inline__ __m256h __DEFAULT_FN_ATTRS256

clang/lib/Headers/avxintrin.h

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -333,10 +333,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
333333
/// A 256-bit vector of [4 x double].
334334
/// \returns A 256-bit vector of [4 x double] containing the square roots of the
335335
/// values in the operand.
336-
static __inline __m256d __DEFAULT_FN_ATTRS
337-
_mm256_sqrt_pd(__m256d __a)
338-
{
339-
return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
336+
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
337+
return __builtin_elementwise_sqrt(__a);
340338
}
341339

342340
/// Calculates the square roots of the values in a 256-bit vector of
@@ -350,10 +348,8 @@ _mm256_sqrt_pd(__m256d __a)
350348
/// A 256-bit vector of [8 x float].
351349
/// \returns A 256-bit vector of [8 x float] containing the square roots of the
352350
/// values in the operand.
353-
static __inline __m256 __DEFAULT_FN_ATTRS
354-
_mm256_sqrt_ps(__m256 __a)
355-
{
356-
return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
351+
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
352+
return __builtin_elementwise_sqrt(__a);
357353
}
358354

359355
/// Calculates the reciprocal square roots of the values in a 256-bit

clang/lib/Headers/emmintrin.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -241,8 +241,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
241241
/// bits are copied from the upper 64 bits of operand \a __a.
242242
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
243243
__m128d __b) {
244-
__m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
245-
return __extension__(__m128d){__c[0], __a[1]};
244+
return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
246245
}
247246

248247
/// Calculates the square root of each of the two values stored in a
@@ -257,7 +256,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
257256
/// \returns A 128-bit vector of [2 x double] containing the square roots of the
258257
/// values in the operand.
259258
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
260-
return __builtin_ia32_sqrtpd((__v2df)__a);
259+
return __builtin_elementwise_sqrt(__a);
261260
}
262261

263262
/// Compares lower 64-bit double-precision values of both operands, and

clang/lib/Headers/xmmintrin.h

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
231231
/// used in the calculation.
232232
/// \returns A 128-bit vector of [4 x float] containing the square root of the
233233
/// value in the low-order bits of the operand.
234-
static __inline__ __m128 __DEFAULT_FN_ATTRS
235-
_mm_sqrt_ss(__m128 __a)
236-
{
237-
return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
234+
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
235+
__a[0] = __builtin_elementwise_sqrt(__a[0]);
236+
return __a;
238237
}
239238

240239
/// Calculates the square roots of the values stored in a 128-bit vector
@@ -248,10 +247,8 @@ _mm_sqrt_ss(__m128 __a)
248247
/// A 128-bit vector of [4 x float].
249248
/// \returns A 128-bit vector of [4 x float] containing the square roots of the
250249
/// values in the operand.
251-
static __inline__ __m128 __DEFAULT_FN_ATTRS
252-
_mm_sqrt_ps(__m128 __a)
253-
{
254-
return __builtin_ia32_sqrtps((__v4sf)__a);
250+
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
251+
return __builtin_elementwise_sqrt(__a);
255252
}
256253

257254
/// Calculates the approximate reciprocal of the value stored in the

clang/test/CodeGen/X86/sse-builtins-constrained.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,10 @@ __m128 test_mm_sqrt_ps(__m128 x) {
2828

2929
__m128 test_sqrt_ss(__m128 x) {
3030
// COMMON-LABEL: test_sqrt_ss
31-
// COMMONIR: extractelement <4 x float> {{.*}}, i64 0
31+
// COMMONIR: extractelement <4 x float> {{.*}}, i32 0
3232
// UNCONSTRAINED: call float @llvm.sqrt.f32(float {{.*}})
3333
// CONSTRAINED: call float @llvm.experimental.constrained.sqrt.f32(float {{.*}}, metadata !{{.*}})
3434
// CHECK-ASM: sqrtss
35-
// COMMONIR: insertelement <4 x float> {{.*}}, float {{.*}}, i64 0
35+
// COMMONIR: insertelement <4 x float> {{.*}}, float {{.*}}, i32 0
3636
return _mm_sqrt_ss(x);
3737
}
38-

clang/test/CodeGen/X86/sse-builtins.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -751,9 +751,9 @@ __m128 test_mm_sqrt_ps(__m128 x) {
751751

752752
__m128 test_mm_sqrt_ss(__m128 x) {
753753
// CHECK-LABEL: test_mm_sqrt_ss
754-
// CHECK: extractelement <4 x float> {{.*}}, i64 0
754+
// CHECK: extractelement <4 x float> {{.*}}, i32 0
755755
// CHECK: call float @llvm.sqrt.f32(float {{.*}})
756-
// CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i64 0
756+
// CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 0
757757
return _mm_sqrt_ss(x);
758758
}
759759

0 commit comments

Comments
 (0)