diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 7a14c6ec21a1a..272704c8451a7 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -2464,7 +2464,8 @@ let Features = "avx512vl",
   def pternlogq256_maskz : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
 }
 
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def shuf_f32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
   def shuf_f64x2 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
   def shuf_i32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
@@ -2476,7 +2477,8 @@ let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVecto
   def shufps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def shuf_f32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
   def shuf_f64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
   def shuf_i32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
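Marking these builtins Constexpr is what allows the wrapper intrinsics to participate in constant evaluation. A minimal usage sketch, not part of the patch, assuming a C++ translation unit compiled with AVX-512VL enabled and the usual <immintrin.h> wrappers:

    #include <immintrin.h>

    // With the builtin marked Constexpr, a call whose arguments are constant
    // expressions can be folded by the constant evaluator.
    constexpr __m256d A = {1.0, 2.0, 3.0, 4.0};
    constexpr __m256d B = {10.0, 20.0, 30.0, 40.0};
    // imm = 3 (0b11): result lane 0 <- A lane 1, result lane 1 <- B lane 1.
    constexpr __m256d R = _mm256_shuffle_f64x2(A, B, 3); // {3.0, 4.0, 30.0, 40.0}

The TEST_CONSTEXPR cases added to the CodeGen tests below exercise exactly this path, once in the bytecode interpreter and once in the classic constant evaluator.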
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5a96320e12b6f..8e0d254bdddc9 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -4718,6 +4718,39 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
     return interp__builtin_elementwise_triop(S, OpPC, Call,
                                              llvm::APIntOps::fshr);
 
+  case X86::BI__builtin_ia32_shuf_f32x4_256:
+  case X86::BI__builtin_ia32_shuf_i32x4_256:
+  case X86::BI__builtin_ia32_shuf_f64x2_256:
+  case X86::BI__builtin_ia32_shuf_i64x2_256:
+  case X86::BI__builtin_ia32_shuf_f32x4:
+  case X86::BI__builtin_ia32_shuf_i32x4:
+  case X86::BI__builtin_ia32_shuf_f64x2:
+  case X86::BI__builtin_ia32_shuf_i64x2: {
+    // Destination and sources A, B all have the same type.
+    QualType VecQT = Call->getArg(0)->getType();
+    const auto *VecT = VecQT->castAs<VectorType>();
+    unsigned NumElems = VecT->getNumElements();
+    unsigned ElemBits = S.getASTContext().getTypeSize(VecT->getElementType());
+    constexpr unsigned LaneBits = 128u;
+    unsigned NumLanes = (NumElems * ElemBits) / LaneBits;
+    unsigned NumElemsPerLane = LaneBits / ElemBits;
+
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call,
+        [NumLanes, NumElemsPerLane](unsigned DstIdx, unsigned ShuffleMask) {
+          // DstIdx determines source. ShuffleMask selects lane in source.
+          unsigned BitsPerElem = NumLanes / 2;
+          unsigned IndexMask = (1u << BitsPerElem) - 1;
+          unsigned Lane = DstIdx / NumElemsPerLane;
+          unsigned SrcIdx = (Lane < NumLanes / 2) ? 0 : 1;
+          unsigned BitIdx = BitsPerElem * Lane;
+          unsigned SrcLaneIdx = (ShuffleMask >> BitIdx) & IndexMask;
+          unsigned ElemInLane = DstIdx % NumElemsPerLane;
+          unsigned IdxToPick = SrcLaneIdx * NumElemsPerLane + ElemInLane;
+          return std::pair{SrcIdx, IdxToPick};
+        });
+  }
+
   case X86::BI__builtin_ia32_insertf32x4_256:
   case X86::BI__builtin_ia32_inserti32x4_256:
   case X86::BI__builtin_ia32_insertf64x2_256:
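The lambda maps each destination element index to a (source, source-element) pair: the low half of the destination lanes always reads from operand A, the high half from operand B, and the immediate supplies NumLanes/2 selector bits per destination lane. A self-contained sketch of the same arithmetic, with hypothetical names that are not part of the patch:

    #include <cstdio>
    #include <utility>

    // Standalone re-implementation of the interpreter lambda's index math.
    std::pair<unsigned, unsigned> mapElem(unsigned DstIdx, unsigned Imm,
                                          unsigned NumLanes,
                                          unsigned NumElemsPerLane) {
      unsigned BitsPerElem = NumLanes / 2;             // selector bits per lane
      unsigned IndexMask = (1u << BitsPerElem) - 1;
      unsigned Lane = DstIdx / NumElemsPerLane;        // destination lane
      unsigned SrcIdx = (Lane < NumLanes / 2) ? 0 : 1; // low lanes <- A, high <- B
      unsigned SrcLaneIdx = (Imm >> (BitsPerElem * Lane)) & IndexMask;
      return {SrcIdx, SrcLaneIdx * NumElemsPerLane + DstIdx % NumElemsPerLane};
    }

    int main() {
      // Trace the 512-bit i64x2 case (NumLanes = 4, NumElemsPerLane = 2) with
      // imm = 0b11000110: destination lanes pick source lanes 2, 1, 0, 3,
      // matching the _mm512_shuffle_i64x2 TEST_CONSTEXPR added below.
      for (unsigned DstIdx = 0; DstIdx < 8; ++DstIdx) {
        auto [Src, Elem] = mapElem(DstIdx, 0b11000110, 4, 2);
        std::printf("dst[%u] = %s[%u]\n", DstIdx, Src ? "B" : "A", Elem);
      }
      return 0;
    }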
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 74f6e3acb6b39..999669f7539cd 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -50,6 +50,7 @@
 #include "clang/AST/RecordLayout.h"
 #include "clang/AST/StmtVisitor.h"
 #include "clang/AST/Type.h"
+#include "clang/AST/TypeBase.h"
 #include "clang/AST/TypeLoc.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Basic/DiagnosticSema.h"
@@ -13461,6 +13462,56 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
 
+  case X86::BI__builtin_ia32_shuf_f32x4_256:
+  case X86::BI__builtin_ia32_shuf_i32x4_256:
+  case X86::BI__builtin_ia32_shuf_f64x2_256:
+  case X86::BI__builtin_ia32_shuf_i64x2_256:
+  case X86::BI__builtin_ia32_shuf_f32x4:
+  case X86::BI__builtin_ia32_shuf_i32x4:
+  case X86::BI__builtin_ia32_shuf_f64x2:
+  case X86::BI__builtin_ia32_shuf_i64x2: {
+    APValue SourceA, SourceB;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceA) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceB))
+      return false;
+
+    APSInt Imm;
+    if (!EvaluateInteger(E->getArg(2), Imm, Info))
+      return false;
+
+    // Destination and sources A, B all have the same type.
+    unsigned NumElems = SourceA.getVectorLength();
+    const VectorType *VT = E->getArg(0)->getType()->castAs<VectorType>();
+    QualType ElemQT = VT->getElementType();
+    unsigned ElemBits = Info.Ctx.getTypeSize(ElemQT);
+    constexpr unsigned LaneBits = 128u;
+    unsigned NumLanes = (NumElems * ElemBits) / LaneBits;
+    unsigned NumElemsPerLane = LaneBits / ElemBits;
+
+    unsigned DstLen = SourceA.getVectorLength();
+    SmallVector<APValue> ResultElements;
+    ResultElements.reserve(DstLen);
+
+    APValue R;
+    if (!evalShuffleGeneric(
+            Info, E, R,
+            [NumLanes, NumElemsPerLane](unsigned DstIdx, unsigned ShuffleMask)
+                -> std::pair<unsigned, unsigned> {
+              // DstIdx determines source. ShuffleMask selects lane in source.
+              unsigned BitsPerElem = NumLanes / 2;
+              unsigned IndexMask = (1u << BitsPerElem) - 1;
+              unsigned Lane = DstIdx / NumElemsPerLane;
+              unsigned SrcIdx = (Lane < NumLanes / 2) ? 0 : 1;
+              unsigned BitIdx = BitsPerElem * Lane;
+              unsigned SrcLaneIdx = (ShuffleMask >> BitIdx) & IndexMask;
+              unsigned ElemInLane = DstIdx % NumElemsPerLane;
+              unsigned IdxToPick = SrcLaneIdx * NumElemsPerLane + ElemInLane;
+              return {SrcIdx, IdxToPick};
+            }))
+      return false;
+    return Success(R, E);
+  }
+
   case X86::BI__builtin_ia32_insertf32x4_256:
   case X86::BI__builtin_ia32_inserti32x4_256:
   case X86::BI__builtin_ia32_insertf64x2_256:
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index eb25aa538e9a3..4881700bb5a66 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -6747,6 +6747,7 @@ __m512 test_mm512_shuffle_f32x4(__m512 __A, __m512 __B) {
   // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32>
   return _mm512_shuffle_f32x4(__A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512(_mm512_shuffle_f32x4(((__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0}), 0b11111111), 13.0f, 14.0f, 15.0f, 16.0f, 13.0f, 14.0f, 15.0f, 16.0f, 130.0, 140.0, 150.0, 160.0, 130.0, 140.0, 150.0, 160.0));
 
 __m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_f32x4
@@ -6767,6 +6768,7 @@ __m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) {
   // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32>
   return _mm512_shuffle_f64x2(__A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512d(_mm512_shuffle_f64x2(((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0b10101100), 1.0, 2.0, 7.0, 8.0, 50.0, 60.0, 50.0, 60.0));
 
 __m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_f64x2
@@ -6787,6 +6789,8 @@ __m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) {
   // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
   return _mm512_shuffle_i32x4(__A, __B, 4);
 }
+TEST_CONSTEXPR(match_v16si(_mm512_shuffle_i32x4(((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m512i)(__v16si){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 0), 1, 2, 3, 4, 1, 2, 3, 4, 10, 20, 30, 40, 10, 20, 30, 40));
+
 
 __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_i32x4
@@ -6807,6 +6811,7 @@ __m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) {
   // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32>
   return _mm512_shuffle_i64x2(__A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512i(_mm512_shuffle_i64x2(((__m512i){1, 2, 3, 4, 5, 6, 7, 8}), ((__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0b11000110), 5, 6, 3, 4, 10, 20, 70, 80));
 
 __m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_i64x2
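As a worked decoding of the 0b10101100 immediate in the _mm512_shuffle_f64x2 test above: with four 128-bit lanes there are two selector bits per destination lane, and the low two destination lanes read from A while the high two read from B, so the fields 00, 11, 10, 10 (low to high) select A lane 0, A lane 3, B lane 2, B lane 2, giving {1.0, 2.0, 7.0, 8.0, 50.0, 60.0, 50.0, 60.0}. The same decoding as a static_assert sketch, mine rather than part of the test file:

    // Four 2-bit selector fields of imm 0b10101100, low to high.
    constexpr unsigned Imm = 0b10101100;
    static_assert(((Imm >> 0) & 3) == 0, "dst lane 0 <- A lane 0");
    static_assert(((Imm >> 2) & 3) == 3, "dst lane 1 <- A lane 3");
    static_assert(((Imm >> 4) & 3) == 2, "dst lane 2 <- B lane 2");
    static_assert(((Imm >> 6) & 3) == 2, "dst lane 3 <- B lane 2");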
diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
index e05b1ddf7b69a..aff775c7f7b3c 100644
--- a/clang/test/CodeGen/X86/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -8989,6 +8989,7 @@ __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) {
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
   return _mm256_shuffle_f32x4(__A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256(_mm256_shuffle_f32x4(((__m256){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m256){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 1), 5.0, 6.0, 7.0, 8.0, 10.0, 20.0, 30.0, 40.0));
 
 __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: test_mm256_mask_shuffle_f32x4
@@ -9009,6 +9010,7 @@ __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
   return _mm256_shuffle_f64x2(__A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_shuffle_f64x2(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){10.0, 20.0, 30.0, 40.0}), 3), 3.0, 4.0, 30.0, 40.0));
 
 __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: test_mm256_mask_shuffle_f64x2
@@ -9031,6 +9033,7 @@ __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
   // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32>
   return _mm256_shuffle_i32x4(__A, __B, 3);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_shuffle_i32x4(((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), ((__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}), 0), 1, 2, 3, 4, 10, 20, 30, 40));
 
 __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_shuffle_i32x4
@@ -9051,6 +9054,7 @@ __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32>
   return _mm256_shuffle_i64x2(__A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256i(_mm256_shuffle_i64x2(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m256i){10ULL, 20ULL, 30ULL, 40ULL}), 2), 1ULL, 2ULL, 30ULL, 40ULL));
 
 __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_shuffle_i64x2
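The 256-bit variants have only two 128-bit lanes, so BitsPerElem = NumLanes / 2 = 1 and each destination lane consumes a single immediate bit. A decoding sketch for the _mm256_shuffle_i64x2 test above (imm = 2), again mine rather than part of the suite:

    // One selector bit per destination lane in the 256-bit case.
    constexpr unsigned Imm = 2; // 0b10
    static_assert(((Imm >> 0) & 1) == 0, "dst lane 0 <- A lane 0: {1, 2}");
    static_assert(((Imm >> 1) & 1) == 1, "dst lane 1 <- B lane 1: {30, 40}");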