Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -198,14 +198,17 @@ let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDecl
def _mm_sfence : X86LibBuiltin<"void()">;
}

let Features = "sse", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
}

let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def rcpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
}

let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
Expand All @@ -222,13 +225,13 @@ let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
}

let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
def cvttpd2dq : X86Builtin<"_Vector<4, int>(_Vector<2, double>)">;
Expand Down Expand Up @@ -488,13 +491,16 @@ let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVecto
def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">;
}

let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
}

let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def vpermilvarpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>)">;
def vpermilvarps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>)">;
def vpermilvarpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">;
def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">;
def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
Expand Down Expand Up @@ -2474,6 +2480,9 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
def shuf_f64x2 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
def shuf_i32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
def shuf_i64x2 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
}

let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def shufpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
def shufps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
}
Expand Down
63 changes: 63 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3352,6 +3352,33 @@ static bool interp__builtin_x86_byteshift(
return true;
}

static bool interp__builtin_ia32_shuffle_generic(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
GetSourceIndex) {

assert(Call->getNumArgs() == 3);
unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();

QualType Arg0Type = Call->getArg(0)->getType();
const auto *VecT = Arg0Type->castAs<VectorType>();
PrimType ElemT = *S.getContext().classify(VecT->getElementType());
unsigned NumElems = VecT->getNumElements();

const Pointer &B = S.Stk.pop<Pointer>();
const Pointer &A = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const Pointer &Src = (SrcVecIdx == 0) ? A : B;
TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
}
Dst.initializeAllElements();

return true;
}

bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
uint32_t BuiltinID) {
if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
Expand Down Expand Up @@ -4282,6 +4309,42 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_selectpd_512:
return interp__builtin_select(S, OpPC, Call);

case X86::BI__builtin_ia32_shufps:
case X86::BI__builtin_ia32_shufps256:
case X86::BI__builtin_ia32_shufps512:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
unsigned NumElemPerLane = 4;
unsigned NumSelectableElems = NumElemPerLane / 2;
unsigned BitsPerElem = 2;
unsigned IndexMask = 0x3;
unsigned MaskBits = 8;
unsigned Lane = DstIdx / NumElemPerLane;
unsigned ElemInLane = DstIdx % NumElemPerLane;
unsigned LaneOffset = Lane * NumElemPerLane;
unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
});
case X86::BI__builtin_ia32_shufpd:
case X86::BI__builtin_ia32_shufpd256:
case X86::BI__builtin_ia32_shufpd512:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
unsigned NumElemPerLane = 2;
unsigned NumSelectableElems = NumElemPerLane / 2;
unsigned BitsPerElem = 1;
unsigned IndexMask = 0x1;
unsigned MaskBits = 8;
unsigned Lane = DstIdx / NumElemPerLane;
unsigned ElemInLane = DstIdx % NumElemPerLane;
unsigned LaneOffset = Lane * NumElemPerLane;
unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
});
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512:
Expand Down
84 changes: 83 additions & 1 deletion clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11619,6 +11619,39 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
return true;
}

static bool evalShuffleGeneric(
EvalInfo &Info, const CallExpr *Call, APValue &Out,
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
GetSourceIndex) {

const auto *VT = Call->getType()->getAs<VectorType>();
if (!VT)
return false;

APSInt MaskImm;
if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
return false;
unsigned ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());

APValue A, B;
if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
!EvaluateAsRValue(Info, Call->getArg(1), B))
return false;

unsigned NumElts = VT->getNumElements();
SmallVector<APValue, 16> ResultElements;
ResultElements.reserve(NumElts);

for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const APValue &Src = (SrcVecIdx == 0) ? A : B;
ResultElements.push_back(Src.getVectorElt(SrcIdx));
}

Out = APValue(ResultElements.data(), ResultElements.size());
return true;
}

static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
APValue &Out) {
APValue SrcVec, ControlVec;
Expand Down Expand Up @@ -12398,7 +12431,56 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {

return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}

case X86::BI__builtin_ia32_shufps:
case X86::BI__builtin_ia32_shufps256:
case X86::BI__builtin_ia32_shufps512: {
APValue R;
if (!evalShuffleGeneric(
Info, E, R,
[](unsigned DstIdx,
unsigned ShuffleMask) -> std::pair<unsigned, unsigned> {
constexpr unsigned LaneBits = 128u;
unsigned NumElemPerLane = LaneBits / 32;
unsigned NumSelectableElems = NumElemPerLane / 2;
unsigned BitsPerElem = 2;
unsigned IndexMask = (1u << BitsPerElem) - 1;
unsigned MaskBits = 8;
unsigned Lane = DstIdx / NumElemPerLane;
unsigned ElemInLane = DstIdx % NumElemPerLane;
unsigned LaneOffset = Lane * NumElemPerLane;
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return {SrcIdx, LaneOffset + Index};
}))
return false;
return Success(R, E);
}
case X86::BI__builtin_ia32_shufpd:
case X86::BI__builtin_ia32_shufpd256:
case X86::BI__builtin_ia32_shufpd512: {
APValue R;
if (!evalShuffleGeneric(
Info, E, R,
[](unsigned DstIdx,
unsigned ShuffleMask) -> std::pair<unsigned, unsigned> {
constexpr unsigned LaneBits = 128u;
unsigned NumElemPerLane = LaneBits / 64;
unsigned NumSelectableElems = NumElemPerLane / 2;
unsigned BitsPerElem = 1;
unsigned IndexMask = (1u << BitsPerElem) - 1;
unsigned MaskBits = 8;
unsigned Lane = DstIdx / NumElemPerLane;
unsigned ElemInLane = DstIdx % NumElemPerLane;
unsigned LaneOffset = Lane * NumElemPerLane;
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return {SrcIdx, LaneOffset + Index};
}))
return false;
return Success(R, E);
}
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512: {
Expand Down
4 changes: 4 additions & 0 deletions clang/test/CodeGen/X86/avx-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1891,12 +1891,16 @@ __m256d test_mm256_shuffle_pd(__m256d A, __m256d B) {
return _mm256_shuffle_pd(A, B, 0);
}

TEST_CONSTEXPR((match_m256d(_mm256_shuffle_pd(((__m256d)(__v4df){1.0, 2.0, 3.0, 4.0}), ((__m256d)(__v4df){5.0, 6.0, 7.0, 8.0}), 15), 2.0, 6.0, 4.0, 8.0)));

__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
// CHECK-LABEL: test_mm256_shuffle_ps
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
return _mm256_shuffle_ps(A, B, 0);
}

TEST_CONSTEXPR((match_m256(_mm256_shuffle_ps(((__m256)(__v8sf){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256)(__v8sf){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 4), 1.0f, 2.0f, 9.0f, 9.0f, 5.0f, 6.0f, 13.0f, 13.0f)));

__m256d test_mm256_sqrt_pd(__m256d A) {
// CHECK-LABEL: test_mm256_sqrt_pd
// CHECK: call {{.*}}<4 x double> @llvm.sqrt.v4f64(<4 x double> %{{.*}})
Expand Down
6 changes: 5 additions & 1 deletion clang/test/CodeGen/X86/avx512f-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -6741,9 +6741,13 @@ __m512 test_mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V) {
// CHECK-LABEL: test_mm512_maskz_shuffle_ps
// CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
// CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_maskz_shuffle_ps(__U, __M, __V, 4);
return _mm512_maskz_shuffle_ps(__U, __M, __V, 4);
}

TEST_CONSTEXPR((match_m512(_mm512_shuffle_ps(((__m512)(__v16sf){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512)(__v16sf){17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f}), 4), 1.0f, 2.0f, 17.0f, 17.0f, 5.0f, 6.0f, 21.0f, 21.0f, 9.0f, 10.0f, 25.0f, 25.0f, 13.0f, 14.0f, 29.0f, 29.0f)));
TEST_CONSTEXPR((match_m512d(_mm512_shuffle_pd(((__m512d)(__v8df){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d)(__v8df){9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0}), 48), 1.0, 9.0, 3.0, 11.0, 6.0, 14.0, 7.0, 15.0)));
TEST_CONSTEXPR((match_m512d(_mm512_maskz_shuffle_pd(0xFF, ((__m512d)(__v8df){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d)(__v8df){9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0}), 48), 1.0, 9.0, 3.0, 11.0, 6.0, 14.0, 7.0, 15.0)));

__m128d test_mm_sqrt_round_sd(__m128d __A, __m128d __B) {
// CHECK-LABEL: test_mm_sqrt_round_sd
// CHECK: call {{.*}}<2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 -1, i32 11)
Expand Down
7 changes: 6 additions & 1 deletion clang/test/CodeGen/X86/avx512vl-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -8933,9 +8933,14 @@ __m256 test_mm256_maskz_shuffle_ps(__mmask8 __U, __m256 __A, __m256 __B) {
// CHECK-LABEL: test_mm256_maskz_shuffle_ps
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
return _mm256_maskz_shuffle_ps(__U, __A, __B, 4);
return _mm256_maskz_shuffle_ps(__U, __A, __B, 4);
}

TEST_CONSTEXPR((match_m128d(_mm_maskz_shuffle_pd(0x3, ((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 3), 2.0, 4.0)));
TEST_CONSTEXPR((match_m256d(_mm256_maskz_shuffle_pd(0xF, ((__m256d)(__v4df){1.0, 2.0, 3.0, 4.0}), ((__m256d)(__v4df){5.0, 6.0, 7.0, 8.0}), 15), 2.0, 6.0, 4.0, 8.0)));
TEST_CONSTEXPR((match_m128(_mm_maskz_shuffle_ps(0xF, ((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 4), 1.0f, 2.0f, 5.0f, 5.0f)));
TEST_CONSTEXPR((match_m256(_mm256_maskz_shuffle_ps(0xFF, ((__m256)(__v8sf){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256)(__v8sf){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 4), 1.0f, 2.0f, 9.0f, 9.0f, 5.0f, 6.0f, 13.0f, 13.0f)));

__m128d test_mm_rsqrt14_pd(__m128d __A) {
// CHECK-LABEL: test_mm_rsqrt14_pd
// CHECK: @llvm.x86.avx512.rsqrt14.pd.128
Expand Down
5 changes: 5 additions & 0 deletions clang/test/CodeGen/X86/sse-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,11 @@ __m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
return _mm_shuffle_ps(A, B, 0);
}

TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 4), 1.0f, 2.0f, 5.0f, 5.0f)));
TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 0), 1.0f, 1.0f, 5.0f, 5.0f)));
TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 255), 4.0f, 4.0f, 8.0f, 8.0f)));
TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 27), 4.0f, 3.0f, 6.0f, 5.0f)));

__m128 test_mm_sqrt_ps(__m128 x) {
// CHECK-LABEL: test_mm_sqrt_ps
// CHECK: call {{.*}}<4 x float> @llvm.sqrt.v4f32(<4 x float> {{.*}})
Expand Down
5 changes: 5 additions & 0 deletions clang/test/CodeGen/X86/sse2-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1314,6 +1314,11 @@ __m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
return _mm_shuffle_pd(A, B, 1);
}

TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 3), 2.0, 4.0)));
TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 0), 1.0, 3.0)));
TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 1), 2.0, 3.0)));
TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 2), 1.0, 4.0)));

__m128i test_mm_shufflehi_epi16(__m128i A) {
// CHECK-LABEL: test_mm_shufflehi_epi16
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
Expand Down
Loading