-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[X86] Replace BF16 to F32 conversions with generic conversions #169781
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Let standard casting / builtin_convertvector handle the conversions from BF16 to F32 My only query is how to best implement _mm_cvtpbh_ps - I went for the v8bf16 -> v8f32 conversion followed by subvector extraction in the end, but could just as easily extract a v4bf16 first - makes no difference to final codegen. First part of llvm#154911
|
@llvm/pr-subscribers-clang Author: Simon Pilgrim (RKSimon) ChangesLet standard casting / builtin_convertvector handle the conversions from BF16 to F32 My only query is how to best implement _mm_cvtpbh_ps - I went for the v8bf16 -> v8f32 conversion followed by subvector extraction in the end, but could just as easily extract a v4bf16 first - makes no difference to final codegen. First part of #154911 Full diff: https://github.com/llvm/llvm-project/pull/169781.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index fcc3957f9b8ab..f0991a1d82dfc 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -3358,10 +3358,6 @@ let Features = "avx512bf16", Attributes = [NoThrow, Const, RequiredVectorWidth<5
def dpbf16ps_512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
}
-let Features = "avx512bf16", Attributes = [NoThrow, Const] in {
- def cvtsbf162ss_32 : X86Builtin<"float(__bf16)">;
-}
-
let Features = "avx512vp2intersect", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
def vp2intersect_q_512 : X86Builtin<"void(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char *, unsigned char *)">;
}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index 00c8a1cf16e31..1b4ab08588602 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2796,8 +2796,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
- case X86::BI__builtin_ia32_cvtsbf162ss_32:
- return Builder.CreateFPExt(Ops[0], Builder.getFloatTy());
case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h
index 3973f0e389685..9a21d8680045a 100644
--- a/clang/lib/Headers/avx512bf16intrin.h
+++ b/clang/lib/Headers/avx512bf16intrin.h
@@ -36,7 +36,7 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
/// \returns A float data whose sign field and exponent field keep unchanged,
/// and fraction field is extended to 23 bits.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
- return __builtin_ia32_cvtsbf162ss_32(__A);
+ return (float)(__A);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -236,8 +236,7 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
- (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
+ return (__m512) __builtin_convertvector(__A, __v16sf);
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
@@ -252,8 +251,9 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
- (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_cvtpbh_ps(__A),
+ (__v16sf)_mm512_setzero_ps());
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
@@ -270,9 +270,8 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
- (__m512i)__S, (__mmask16)__U,
- (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
+ return (__m512)__builtin_ia32_selectps_512(
+ (__mmask16)__U, (__v16sf)_mm512_cvtpbh_ps(__A), (__v16sf)__S);
}
#undef __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h
index 2d7ea0114d6a5..0e3184554a6c8 100644
--- a/clang/lib/Headers/avx512vlbf16intrin.h
+++ b/clang/lib/Headers/avx512vlbf16intrin.h
@@ -422,8 +422,8 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
- return _mm_castsi128_ps(
- (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
+ return (__m128)_mm256_castps256_ps128(
+ (__m256) __builtin_convertvector(__A, __v8sf));
}
/// Convert Packed BF16 Data to Packed float Data.
@@ -434,8 +434,7 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
- (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
+ return (__m256) __builtin_convertvector(__A, __v8sf);
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
@@ -450,8 +449,8 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
- return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
- (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
+ return (__m128)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)_mm_setzero_ps());
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
@@ -466,8 +465,9 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
- (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_cvtpbh_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
@@ -485,9 +485,8 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
- return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
- (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
- 16));
+ return (__m128)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S);
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
@@ -505,9 +504,8 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
- (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
- 16));
+ return (__m256)__builtin_ia32_selectps_256(
+ (__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S);
}
#undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c
index 3f544d387f7aa..dfaae4e459f7e 100644
--- a/clang/test/CodeGen/X86/avx512bf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bf16-builtins.c
@@ -79,23 +79,20 @@ __m512 test_mm512_mask_dpbf16_ps(__m512 D, __m512bh A, __m512bh B, __mmask16 U)
__m512 test_mm512_cvtpbh_ps(__m256bh A) {
// CHECK-LABEL: test_mm512_cvtpbh_ps
- // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
- // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
return _mm512_cvtpbh_ps(A);
}
__m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_maskz_cvtpbh_ps
- // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
- // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
- // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_cvtpbh_ps(M, A);
}
__m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_mask_cvtpbh_ps
- // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
- // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}})
- // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_cvtpbh_ps(S, M, A);
}
diff --git a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
index d59b254520774..80afcd7a490db 100644
--- a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
@@ -156,46 +156,43 @@ __bf16 test_mm_cvtness_sbh(float A) {
__m128 test_mm_cvtpbh_ps(__m128bh A) {
// CHECK-LABEL: test_mm_cvtpbh_ps
- // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
- // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm_cvtpbh_ps(A);
}
__m256 test_mm256_cvtpbh_ps(__m128bh A) {
// CHECK-LABEL: test_mm256_cvtpbh_ps
- // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
- // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
return _mm256_cvtpbh_ps(A);
}
__m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_maskz_cvtpbh_ps
- // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
- // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
- // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_cvtpbh_ps(M, A);
}
__m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_maskz_cvtpbh_ps
- // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
- // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
- // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_cvtpbh_ps(M, A);
}
__m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_mask_cvtpbh_ps
- // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
- // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
- // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_cvtpbh_ps(S, M, A);
}
__m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_mask_cvtpbh_ps
- // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
- // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
- // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_cvtpbh_ps(S, M, A);
}
|
|
@llvm/pr-subscribers-clang-codegen Author: Simon Pilgrim (RKSimon) ChangesLet standard casting / builtin_convertvector handle the conversions from BF16 to F32 My only query is how to best implement _mm_cvtpbh_ps - I went for the v8bf16 -> v8f32 conversion followed by subvector extraction in the end, but could just as easily extract a v4bf16 first - makes no difference to final codegen. First part of #154911 Full diff: https://github.com/llvm/llvm-project/pull/169781.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index fcc3957f9b8ab..f0991a1d82dfc 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -3358,10 +3358,6 @@ let Features = "avx512bf16", Attributes = [NoThrow, Const, RequiredVectorWidth<5
def dpbf16ps_512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
}
-let Features = "avx512bf16", Attributes = [NoThrow, Const] in {
- def cvtsbf162ss_32 : X86Builtin<"float(__bf16)">;
-}
-
let Features = "avx512vp2intersect", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
def vp2intersect_q_512 : X86Builtin<"void(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char *, unsigned char *)">;
}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index 00c8a1cf16e31..1b4ab08588602 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2796,8 +2796,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
- case X86::BI__builtin_ia32_cvtsbf162ss_32:
- return Builder.CreateFPExt(Ops[0], Builder.getFloatTy());
case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h
index 3973f0e389685..9a21d8680045a 100644
--- a/clang/lib/Headers/avx512bf16intrin.h
+++ b/clang/lib/Headers/avx512bf16intrin.h
@@ -36,7 +36,7 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
/// \returns A float data whose sign field and exponent field keep unchanged,
/// and fraction field is extended to 23 bits.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
- return __builtin_ia32_cvtsbf162ss_32(__A);
+ return (float)(__A);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -236,8 +236,7 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
- (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
+ return (__m512) __builtin_convertvector(__A, __v16sf);
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
@@ -252,8 +251,9 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
- (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_cvtpbh_ps(__A),
+ (__v16sf)_mm512_setzero_ps());
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
@@ -270,9 +270,8 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
- (__m512i)__S, (__mmask16)__U,
- (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
+ return (__m512)__builtin_ia32_selectps_512(
+ (__mmask16)__U, (__v16sf)_mm512_cvtpbh_ps(__A), (__v16sf)__S);
}
#undef __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h
index 2d7ea0114d6a5..0e3184554a6c8 100644
--- a/clang/lib/Headers/avx512vlbf16intrin.h
+++ b/clang/lib/Headers/avx512vlbf16intrin.h
@@ -422,8 +422,8 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
- return _mm_castsi128_ps(
- (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
+ return (__m128)_mm256_castps256_ps128(
+ (__m256) __builtin_convertvector(__A, __v8sf));
}
/// Convert Packed BF16 Data to Packed float Data.
@@ -434,8 +434,7 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
- (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
+ return (__m256) __builtin_convertvector(__A, __v8sf);
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
@@ -450,8 +449,8 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
- return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
- (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
+ return (__m128)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)_mm_setzero_ps());
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
@@ -466,8 +465,9 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
- (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_cvtpbh_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
@@ -485,9 +485,8 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
- return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
- (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
- 16));
+ return (__m128)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S);
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
@@ -505,9 +504,8 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
- (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
- 16));
+ return (__m256)__builtin_ia32_selectps_256(
+ (__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S);
}
#undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c
index 3f544d387f7aa..dfaae4e459f7e 100644
--- a/clang/test/CodeGen/X86/avx512bf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bf16-builtins.c
@@ -79,23 +79,20 @@ __m512 test_mm512_mask_dpbf16_ps(__m512 D, __m512bh A, __m512bh B, __mmask16 U)
__m512 test_mm512_cvtpbh_ps(__m256bh A) {
// CHECK-LABEL: test_mm512_cvtpbh_ps
- // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
- // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
return _mm512_cvtpbh_ps(A);
}
__m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_maskz_cvtpbh_ps
- // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
- // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
- // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_cvtpbh_ps(M, A);
}
__m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_mask_cvtpbh_ps
- // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
- // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}})
- // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_cvtpbh_ps(S, M, A);
}
diff --git a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
index d59b254520774..80afcd7a490db 100644
--- a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
@@ -156,46 +156,43 @@ __bf16 test_mm_cvtness_sbh(float A) {
__m128 test_mm_cvtpbh_ps(__m128bh A) {
// CHECK-LABEL: test_mm_cvtpbh_ps
- // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
- // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm_cvtpbh_ps(A);
}
__m256 test_mm256_cvtpbh_ps(__m128bh A) {
// CHECK-LABEL: test_mm256_cvtpbh_ps
- // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
- // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
return _mm256_cvtpbh_ps(A);
}
__m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_maskz_cvtpbh_ps
- // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
- // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
- // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_cvtpbh_ps(M, A);
}
__m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_maskz_cvtpbh_ps
- // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
- // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
- // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_cvtpbh_ps(M, A);
}
__m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_mask_cvtpbh_ps
- // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
- // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
- // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_cvtpbh_ps(S, M, A);
}
__m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_mask_cvtpbh_ps
- // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
- // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
- // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_cvtpbh_ps(S, M, A);
}
|
|
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesLet standard casting / builtin_convertvector handle the conversions from BF16 to F32 My only query is how to best implement _mm_cvtpbh_ps - I went for the v8bf16 -> v8f32 conversion followed by subvector extraction in the end, but could just as easily extract a v4bf16 first - makes no difference to final codegen. First part of #154911 Full diff: https://github.com/llvm/llvm-project/pull/169781.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index fcc3957f9b8ab..f0991a1d82dfc 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -3358,10 +3358,6 @@ let Features = "avx512bf16", Attributes = [NoThrow, Const, RequiredVectorWidth<5
def dpbf16ps_512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, __bf16>, _Vector<32, __bf16>)">;
}
-let Features = "avx512bf16", Attributes = [NoThrow, Const] in {
- def cvtsbf162ss_32 : X86Builtin<"float(__bf16)">;
-}
-
let Features = "avx512vp2intersect", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
def vp2intersect_q_512 : X86Builtin<"void(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char *, unsigned char *)">;
}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index 00c8a1cf16e31..1b4ab08588602 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2796,8 +2796,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
}
- case X86::BI__builtin_ia32_cvtsbf162ss_32:
- return Builder.CreateFPExt(Ops[0], Builder.getFloatTy());
case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h
index 3973f0e389685..9a21d8680045a 100644
--- a/clang/lib/Headers/avx512bf16intrin.h
+++ b/clang/lib/Headers/avx512bf16intrin.h
@@ -36,7 +36,7 @@ typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
/// \returns A float data whose sign field and exponent field keep unchanged,
/// and fraction field is extended to 23 bits.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
- return __builtin_ia32_cvtsbf162ss_32(__A);
+ return (float)(__A);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -236,8 +236,7 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
- (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
+ return (__m512) __builtin_convertvector(__A, __v16sf);
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
@@ -252,8 +251,9 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
- (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_cvtpbh_ps(__A),
+ (__v16sf)_mm512_setzero_ps());
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
@@ -270,9 +270,8 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
- (__m512i)__S, (__mmask16)__U,
- (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
+ return (__m512)__builtin_ia32_selectps_512(
+ (__mmask16)__U, (__v16sf)_mm512_cvtpbh_ps(__A), (__v16sf)__S);
}
#undef __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h
index 2d7ea0114d6a5..0e3184554a6c8 100644
--- a/clang/lib/Headers/avx512vlbf16intrin.h
+++ b/clang/lib/Headers/avx512vlbf16intrin.h
@@ -422,8 +422,8 @@ static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
- return _mm_castsi128_ps(
- (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
+ return (__m128)_mm256_castps256_ps128(
+ (__m256) __builtin_convertvector(__A, __v8sf));
}
/// Convert Packed BF16 Data to Packed float Data.
@@ -434,8 +434,7 @@ static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
- (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
+ return (__m256) __builtin_convertvector(__A, __v8sf);
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
@@ -450,8 +449,8 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
- return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
- (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
+ return (__m128)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)_mm_setzero_ps());
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
@@ -466,8 +465,9 @@ _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
- (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_cvtpbh_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
@@ -485,9 +485,8 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
- return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
- (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
- 16));
+ return (__m128)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_cvtpbh_ps(__A), (__v4sf)__S);
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
@@ -505,9 +504,8 @@ _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
- (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
- 16));
+ return (__m256)__builtin_ia32_selectps_256(
+ (__mmask8)__U, (__v8sf)_mm256_cvtpbh_ps(__A), (__v8sf)__S);
}
#undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c
index 3f544d387f7aa..dfaae4e459f7e 100644
--- a/clang/test/CodeGen/X86/avx512bf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bf16-builtins.c
@@ -79,23 +79,20 @@ __m512 test_mm512_mask_dpbf16_ps(__m512 D, __m512bh A, __m512bh B, __mmask16 U)
__m512 test_mm512_cvtpbh_ps(__m256bh A) {
// CHECK-LABEL: test_mm512_cvtpbh_ps
- // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
- // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
return _mm512_cvtpbh_ps(A);
}
__m512 test_mm512_maskz_cvtpbh_ps(__mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_maskz_cvtpbh_ps
- // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
- // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
- // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_cvtpbh_ps(M, A);
}
__m512 test_mm512_mask_cvtpbh_ps(__m512 S, __mmask16 M, __m256bh A) {
// CHECK-LABEL: test_mm512_mask_cvtpbh_ps
- // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
- // CHECK: call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %{{.*}}, i32 %{{.*}})
- // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ // CHECK: fpext <16 x bfloat> %{{.*}} to <16 x float>
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_cvtpbh_ps(S, M, A);
}
diff --git a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
index d59b254520774..80afcd7a490db 100644
--- a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c
@@ -156,46 +156,43 @@ __bf16 test_mm_cvtness_sbh(float A) {
__m128 test_mm_cvtpbh_ps(__m128bh A) {
// CHECK-LABEL: test_mm_cvtpbh_ps
- // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
- // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
return _mm_cvtpbh_ps(A);
}
__m256 test_mm256_cvtpbh_ps(__m128bh A) {
// CHECK-LABEL: test_mm256_cvtpbh_ps
- // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
- // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
return _mm256_cvtpbh_ps(A);
}
__m128 test_mm_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_maskz_cvtpbh_ps
- // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
- // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
- // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_maskz_cvtpbh_ps(M, A);
}
__m256 test_mm256_maskz_cvtpbh_ps(__mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_maskz_cvtpbh_ps
- // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
- // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
- // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_cvtpbh_ps(M, A);
}
__m128 test_mm_mask_cvtpbh_ps(__m128 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm_mask_cvtpbh_ps
- // CHECK: sext <4 x i16> %{{.*}} to <4 x i32>
- // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
- // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
return _mm_mask_cvtpbh_ps(S, M, A);
}
__m256 test_mm256_mask_cvtpbh_ps(__m256 S, __mmask8 M, __m128bh A) {
// CHECK-LABEL: test_mm256_mask_cvtpbh_ps
- // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
- // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
- // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+ // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float>
+ // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_mask_cvtpbh_ps(S, M, A);
}
|
| // CHECK-LABEL: test_mm_cvtpbh_ps | ||
| // CHECK: sext <4 x i16> %{{.*}} to <4 x i32> | ||
| // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}}) | ||
| // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doubles the vector size. The same for mask/maskz.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That was my query in the patch summary :) Final codegen is fine, but I wanted to see what everyone's preference was in the implementation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, we can optimize it back to <4 x bfloat>. The difference is still in O0, but I think we don't care the performance for it.
phoebewang
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
| // CHECK-LABEL: test_mm_cvtpbh_ps | ||
| // CHECK: sext <4 x i16> %{{.*}} to <4 x i32> | ||
| // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}}) | ||
| // CHECK: fpext <8 x bfloat> %{{.*}} to <8 x float> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, we can optimize it back to <4 x bfloat>. The difference is still in O0, but I think we don't care the performance for it.
Let standard casting / builtin_convertvector handle the conversions from BF16 to F32
My only query is how to best implement _mm_cvtpbh_ps - I went for the v8bf16 -> v8f32 conversion followed by subvector extraction in the end, but could just as easily extract a v4bf16 first - makes no difference to final codegen.
First part of #154911