[Clang] Replace some x86 sqrt builtins with the generic __builtin_elementwise_sqrt versions (#165682)

philnik777 · web-flow · commit 0dbedd195c94 · 2025-11-27T11:36:43.000Z
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
@@ -156,8 +156,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
   def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
-  def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
-  def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
 }
 
 let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
@@ -170,8 +168,6 @@ let Features = "sse2", Attributes = [NoThrow] in {
 
 let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
-  def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
-  def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
   def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
   def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
   def cvttpd2dq : X86Builtin<"_Vector<4, int>(_Vector<2, double>)">;
@@ -513,8 +509,6 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
 }
 
 let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
-  def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
@@ -3539,14 +3533,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
   def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
 }
 
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
-}
-
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
-}
-
 let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
 }
@@ -5065,15 +5051,3 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>
 let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
 }
-
-let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
-}
-
-let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
-}
-
-let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
-}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2171,21 +2171,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateBitCast(Res, Ops[0]->getType());
   }
 
-  case X86::BI__builtin_ia32_sqrtss:
-  case X86::BI__builtin_ia32_sqrtsd: {
-    Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
-    Function *F;
-    if (Builder.getIsFPConstrained()) {
-      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
-      F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
-                           A->getType());
-      A = Builder.CreateConstrainedFPCall(F, {A});
-    } else {
-      F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
-      A = Builder.CreateCall(F, {A});
-    }
-    return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
-  }
   case X86::BI__builtin_ia32_sqrtsh_round_mask:
   case X86::BI__builtin_ia32_sqrtsd_round_mask:
   case X86::BI__builtin_ia32_sqrtss_round_mask: {
@@ -2225,40 +2210,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
   }
-  case X86::BI__builtin_ia32_sqrtpd256:
-  case X86::BI__builtin_ia32_sqrtpd:
-  case X86::BI__builtin_ia32_sqrtps256:
-  case X86::BI__builtin_ia32_sqrtps:
-  case X86::BI__builtin_ia32_sqrtph256:
-  case X86::BI__builtin_ia32_sqrtph:
   case X86::BI__builtin_ia32_sqrtph512:
-  case X86::BI__builtin_ia32_vsqrtbf16256:
-  case X86::BI__builtin_ia32_vsqrtbf16:
-  case X86::BI__builtin_ia32_vsqrtbf16512:
   case X86::BI__builtin_ia32_sqrtps512:
   case X86::BI__builtin_ia32_sqrtpd512: {
-    if (Ops.size() == 2) {
-      unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
-      // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
-      // otherwise keep the intrinsic.
-      if (CC != 4) {
-        Intrinsic::ID IID;
-
-        switch (BuiltinID) {
-        default:
-          llvm_unreachable("Unsupported intrinsic!");
-        case X86::BI__builtin_ia32_sqrtph512:
-          IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
-          break;
-        case X86::BI__builtin_ia32_sqrtps512:
-          IID = Intrinsic::x86_avx512_sqrt_ps_512;
-          break;
-        case X86::BI__builtin_ia32_sqrtpd512:
-          IID = Intrinsic::x86_avx512_sqrt_pd_512;
-          break;
-        }
-        return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+    unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+    // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+    // otherwise keep the intrinsic.
+    if (CC != 4) {
+      Intrinsic::ID IID;
+
+      switch (BuiltinID) {
+      default:
+        llvm_unreachable("Unsupported intrinsic!");
+      case X86::BI__builtin_ia32_sqrtph512:
+        IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
+        break;
+      case X86::BI__builtin_ia32_sqrtps512:
+        IID = Intrinsic::x86_avx512_sqrt_ps_512;
+        break;
+      case X86::BI__builtin_ia32_sqrtpd512:
+        IID = Intrinsic::x86_avx512_sqrt_pd_512;
+        break;
       }
+      return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
     }
     if (Builder.getIsFPConstrained()) {
       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -429,7 +429,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
       (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
-  return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -826,7 +826,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
       (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
-  return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -843,7 +843,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
-  return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -623,7 +623,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
                                             (__mmask16)(U)))
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
-  return __builtin_ia32_sqrtph((__v8hf)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
@@ -640,7 +640,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
 }
 
 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
-  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
@@ -333,10 +333,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
 ///    A 256-bit vector of [4 x double].
 /// \returns A 256-bit vector of [4 x double] containing the square roots of the
 ///    values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
-  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the square roots of the values in a 256-bit vector of
@@ -350,10 +348,8 @@ _mm256_sqrt_pd(__m256d __a)
 ///    A 256-bit vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the square roots of the
 ///    values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
-  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the reciprocal square roots of the values in a 256-bit
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
@@ -241,8 +241,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
 ///    bits are copied from the upper 64 bits of operand \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
                                                          __m128d __b) {
-  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
-  return __extension__(__m128d){__c[0], __a[1]};
+  return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
 }
 
 /// Calculates the square root of the each of two values stored in a
@@ -257,7 +256,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
 ///    values in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
-  return __builtin_ia32_sqrtpd((__v2df)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Compares lower 64-bit double-precision values of both operands, and
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
@@ -231,10 +231,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
 ///    used in the calculation.
 /// \returns A 128-bit vector of [4 x float] containing the square root of the
 ///    value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
-  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
+  __a[0] = __builtin_elementwise_sqrt(__a[0]);
+  return __a;
 }
 
 /// Calculates the square roots of the values stored in a 128-bit vector
@@ -248,10 +247,8 @@ _mm_sqrt_ss(__m128 __a)
 ///    A 128-bit vector of [4 x float].
 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
 ///    values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
-  return __builtin_ia32_sqrtps((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the approximate reciprocal of the value stored in the
diff --git a/clang/test/CodeGen/X86/sse-builtins-constrained.c b/clang/test/CodeGen/X86/sse-builtins-constrained.c
@@ -28,11 +28,10 @@ __m128 test_mm_sqrt_ps(__m128 x) {
 
 __m128 test_sqrt_ss(__m128 x) {
   // COMMON-LABEL: test_sqrt_ss
-  // COMMONIR: extractelement <4 x float> {{.*}}, i64 0
+  // COMMONIR: extractelement <4 x float> {{.*}}, i32 0
   // UNCONSTRAINED: call float @llvm.sqrt.f32(float {{.*}})
   // CONSTRAINED: call float @llvm.experimental.constrained.sqrt.f32(float {{.*}}, metadata !{{.*}})
   // CHECK-ASM: sqrtss
-  // COMMONIR: insertelement <4 x float> {{.*}}, float {{.*}}, i64 0
+  // COMMONIR: insertelement <4 x float> {{.*}}, float {{.*}}, i32 0
   return _mm_sqrt_ss(x);
 }
-
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
@@ -751,9 +751,9 @@ __m128 test_mm_sqrt_ps(__m128 x) {
 
 __m128 test_mm_sqrt_ss(__m128 x) {
   // CHECK-LABEL: test_mm_sqrt_ss
-  // CHECK: extractelement <4 x float> {{.*}}, i64 0
+  // CHECK: extractelement <4 x float> {{.*}}, i32 0
   // CHECK: call float @llvm.sqrt.f32(float {{.*}})
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i64 0
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 0
   return _mm_sqrt_ss(x);
 }
 
diff --git a/clang/test/CodeGen/X86/sse2-builtins-constrained.c b/clang/test/CodeGen/X86/sse2-builtins-constrained.c
@@ -28,11 +28,10 @@ __m128d test_mm_sqrt_pd(__m128d x) {
 
 __m128d test_sqrt_sd(__m128d x, __m128d y) {
   // COMMON-LABEL: test_sqrt_sd
-  // COMMONIR: extractelement <2 x double> {{.*}}, i64 0
+  // COMMONIR: extractelement <2 x double> {{.*}}, i32 0
   // UNCONSTRAINED: call double @llvm.sqrt.f64(double {{.*}})
   // CONSTRAINED: call double @llvm.experimental.constrained.sqrt.f64(double {{.*}}, metadata !{{.*}})
   // CHECK-ASM: sqrtsd
-  // COMMONIR: insertelement <2 x double> {{.*}}, double {{.*}}, i64 0
+  // COMMONIR: insertelement <2 x double> {{.*}}, double {{.*}}, i32 0
   return _mm_sqrt_sd(x, y);
 }
-
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1440,9 +1440,10 @@ __m128d test_mm_sqrt_pd(__m128d A) {
 
 __m128d test_mm_sqrt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_sqrt_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: call double @llvm.sqrt.f64(double {{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
+  // CHECK: %[[sqrt_vec:.*]] = insertelement <2 x double> poison, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %[[sqrt_vec]], double %{{.*}}, i32 1
   return _mm_sqrt_sd(A, B);
 }
 
diff --git a/clang/test/CodeGen/builtins-x86.c b/clang/test/CodeGen/builtins-x86.c
@@ -282,8 +282,6 @@ void f0(void) {
   tmp_V4f = __builtin_ia32_rcpss(tmp_V4f);
   tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f);
   tmp_V4f = __builtin_ia32_rsqrtss(tmp_V4f);
-  tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f);
-  tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f);
   (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp);
   tmp_i = __builtin_ia32_movmskpd(tmp_V2d);
   tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c);
@@ -292,8 +290,6 @@ void f0(void) {
   (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
 #endif
   tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
-  tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
-  tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
   tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
   tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d);
   tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d);
@@ -400,8 +396,6 @@ void f0(void) {
   tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7);
   tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7);
   tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7);
-  tmp_V4d = __builtin_ia32_sqrtpd256(tmp_V4d);
-  tmp_V8f = __builtin_ia32_sqrtps256(tmp_V8f);
   tmp_V8f = __builtin_ia32_rsqrtps256(tmp_V8f);
   tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f);
   tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1);

File: clang/lib/Headers/avx10_2_512bf16intrin.h
Original file line number	Diff line number	Diff line change
`@@ -429,7 +429,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {`
`429`	`429`	`(__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))`
`430`	`430`
`431`	`431`	`static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {`
`432`		`- return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);`
	`432`	`+ return __builtin_elementwise_sqrt(__A);`
`433`	`433`	`}`
`434`	`434`
`435`	`435`	`static __inline__ __m512bh __DEFAULT_FN_ATTRS512`
File: clang/lib/Headers/avx10_2bf16intrin.h
Original file line number	Diff line number	Diff line change
`@@ -826,7 +826,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {`
`826`	`826`	`(__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))`
`827`	`827`
`828`	`828`	`static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {`
`829`		`- return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);`
	`829`	`+ return __builtin_elementwise_sqrt(__A);`
`830`	`830`	`}`
`831`	`831`
`832`	`832`	`static __inline__ __m256bh __DEFAULT_FN_ATTRS256`
`@@ -843,7 +843,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {`
`843`	`843`	`}`
`844`	`844`
`845`	`845`	`static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {`
`846`		`- return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);`
	`846`	`+ return __builtin_elementwise_sqrt(__A);`
`847`	`847`	`}`
`848`	`848`
`849`	`849`	`static __inline__ __m128bh __DEFAULT_FN_ATTRS128`
File: clang/lib/Headers/avx512vlfp16intrin.h
Original file line number	Diff line number	Diff line change
`@@ -623,7 +623,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {`
`623`	`623`	`(__mmask16)(U)))`
`624`	`624`
`625`	`625`	`static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {`
`626`		`- return __builtin_ia32_sqrtph((__v8hf)__a);`
	`626`	`+ return __builtin_elementwise_sqrt(__a);`
`627`	`627`	`}`
`628`	`628`
`629`	`629`	`static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,`
`@@ -640,7 +640,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,`
`640`	`640`	`}`
`641`	`641`
`642`	`642`	`static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {`
`643`		`- return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);`
	`643`	`+ return __builtin_elementwise_sqrt(__a);`
`644`	`644`	`}`
`645`	`645`
`646`	`646`	`static __inline__ __m256h __DEFAULT_FN_ATTRS256`