[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow shufps/pd shuffles intrinsics to be used in constexpr #164078

chaitanyav · 2025-10-18T12:37:41Z

Resolves #161208

llvmbot · 2025-10-18T13:21:09Z

@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-x86

Author: NagaChaitanya Vellanki (chaitanyav)

Changes

Resolves #161208

Full diff: https://github.com/llvm/llvm-project/pull/164078.diff

8 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsX86.td (+13-4)
(modified) clang/lib/AST/ByteCode/InterpBuiltin.cpp (+54)
(modified) clang/lib/AST/ExprConstant.cpp (+66-1)
(modified) clang/test/CodeGen/X86/avx-builtins.c (+4)
(modified) clang/test/CodeGen/X86/avx512f-builtins.c (+5-1)
(modified) clang/test/CodeGen/X86/avx512vl-builtins.c (+6-1)
(modified) clang/test/CodeGen/X86/sse-builtins.c (+5)
(modified) clang/test/CodeGen/X86/sse2-builtins.c (+5)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 62c70fba946be..c0515f8241dda 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -197,6 +197,10 @@ let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDecl
   def _mm_sfence : X86LibBuiltin<"void()">;
 }
 
+let Features = "sse", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+  def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
+}
+
 let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def rcpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
@@ -204,7 +208,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
   def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
-  def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
 }
 
 let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
@@ -224,13 +227,13 @@ let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
   def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
   def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">;
   def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
+  def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
 }
 
 let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
   def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
   def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
-  def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
   def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
   def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
   def cvttpd2dq : X86Builtin<"_Vector<4, int>(_Vector<2, double>)">;
@@ -488,13 +491,16 @@ let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVecto
   def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">;
 }
 
+let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+  def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
+}
+
 let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
   def vpermilvarpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>)">;
   def vpermilvarps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>)">;
   def vpermilvarpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">;
   def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
-  def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
-  def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
   def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
   def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">;
   def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
@@ -2470,6 +2476,9 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
   def shuf_f64x2 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
   def shuf_i32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
   def shuf_i64x2 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">;
+}
+
+let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def shufpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
   def shufps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
 }
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5838cf8ba7438..1a0a5afc023c8 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3128,6 +3128,52 @@ static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool interp__builtin_ia32_shuf(InterpState &S, CodePtr OpPC, // Bytecode-interpreter evaluation of the shufps/shufpd builtins (128/256/512-bit forms).
+                                      const CallExpr *Call) {
+  assert(Call->getNumArgs() == 3);
+
+  unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); // immediate control value (third argument), popped first
+  QualType Arg0Type = Call->getArg(0)->getType();
+  const auto *VecT = Arg0Type->castAs<VectorType>();
+  PrimType ElemT = *S.getContext().classify(VecT->getElementType());
+  unsigned NumElems = VecT->getNumElements();
+  unsigned LaneWidth = S.getContext().getBitWidth(VecT->getElementType()); // NOTE(review): despite the name this is the *element* width in bits
+  unsigned NumLanes = LaneWidth * NumElems / 128; // total vector bits / 128 = number of 128-bit lanes
+  unsigned NumElemPerLane = 128 / LaneWidth; // elements that fit in one 128-bit lane
+
+  const Pointer &B = S.Stk.pop<Pointer>(); // popped right after the mask; presumably the second source vector
+  const Pointer &A = S.Stk.pop<Pointer>(); // presumably the first source vector
+  const Pointer &Dst = S.Stk.peek<Pointer>(); // destination is only peeked, so it stays on the stack
+
+  unsigned NumSelectableElems = NumElemPerLane / 2; // each result lane: first half taken from A, second half from B
+  unsigned BitsPerElem = NumSelectableElems == 1 ? 1 : 2; // 1 selector bit when a half-lane holds one element, otherwise 2
+  unsigned IndexMask = BitsPerElem == 2 ? 0x3 : 0x1; // mask matching BitsPerElem
+  unsigned MaskBits = 8; // selector bit position wraps after 8 bits of the immediate
+
+  TYPE_SWITCH(ElemT, {
+    unsigned BitIndex = 0; // current read position inside ShuffleMask
+    unsigned DstIdx = 0; // next destination element to write
+
+    for (unsigned LaneId = 0; LaneId != NumLanes; ++LaneId) {
+      unsigned LaneOffset = LaneId * NumElemPerLane; // index of the first element of this lane
+
+      for (unsigned i = 0; i < NumSelectableElems; ++i) { // low half of the lane: select within A's matching lane
+        unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
+        Dst.elem<T>(DstIdx++) = A.elem<T>(LaneOffset + Index);
+        BitIndex = (BitIndex + BitsPerElem) % MaskBits; // wrap so later lanes reuse the 8-bit immediate
+      }
+
+      for (unsigned i = 0; i < NumSelectableElems; ++i) { // high half of the lane: select within B's matching lane
+        unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
+        Dst.elem<T>(DstIdx++) = B.elem<T>(LaneOffset + Index);
+        BitIndex = (BitIndex + BitsPerElem) % MaskBits;
+      }
+    }
+  });
+  Dst.initializeAllElements();
+  return true; // no failure path: always reports success
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                       uint32_t BuiltinID) {
   if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -4003,6 +4049,14 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case X86::BI__builtin_ia32_selectpd_512:
     return interp__builtin_select(S, OpPC, Call);
 
+  case X86::BI__builtin_ia32_shufps:
+  case X86::BI__builtin_ia32_shufps256:
+  case X86::BI__builtin_ia32_shufps512:
+  case X86::BI__builtin_ia32_shufpd:
+  case X86::BI__builtin_ia32_shufpd256:
+  case X86::BI__builtin_ia32_shufpd512:
+    return interp__builtin_ia32_shuf(S, OpPC, Call);
+
   case X86::BI__builtin_ia32_pshufb128:
   case X86::BI__builtin_ia32_pshufb256:
   case X86::BI__builtin_ia32_pshufb512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 16141b27f4ce8..4a48d5b6d730f 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11618,6 +11618,61 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
   return true;
 }
 
+static bool evalShufpspdBuiltin(EvalInfo &Info, const CallExpr *Call, // Constant-evaluates a shufps/shufpd builtin call into Out; returns false if evaluation fails.
+                                APValue &Out) {
+  APValue A, B;
+  APSInt ShuffleMask;
+  if (!EvaluateAsRValue(Info, Call->getArg(0), A) || // both source vectors and the immediate must evaluate
+      !EvaluateAsRValue(Info, Call->getArg(1), B) ||
+      !EvaluateInteger(Call->getArg(2), ShuffleMask, Info))
+    return false;
+
+  const auto *VT = Call->getType()->getAs<VectorType>(); // layout is derived from the call's result type
+  if (!VT)
+    return false;
+
+  QualType ElemT = VT->getElementType();
+  unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);
+  unsigned NumElts = VT->getNumElements();
+
+  constexpr unsigned LaneBits = 128u; // shuffles operate independently on 128-bit lanes
+  unsigned NumElemPerLane = LaneBits / ElemBits;
+  if (!NumElemPerLane || (NumElts % NumElemPerLane) != 0) // reject elements wider than a lane or a partial trailing lane
+    return false;
+
+  unsigned NumLanes = NumElts / NumElemPerLane;
+  uint8_t Ctl = static_cast<uint8_t>(ShuffleMask.getZExtValue()); // only the low 8 bits of the immediate are kept
+
+  unsigned SelectableElts = NumElemPerLane / 2; // each result lane: first half from A, second half from B
+  unsigned BitsPerSel = SelectableElts == 1 ? 1 : 2; // 1 selector bit when a half-lane holds one element, otherwise 2
+  unsigned SelMask = (1u << BitsPerSel) - 1; // 0x1 or 0x3
+  unsigned MaskBits = 8; // selector bit position wraps after 8 bits
+
+  SmallVector<APValue, 16> ResultElements;
+  ResultElements.reserve(NumElts);
+
+  unsigned BitIdx = 0; // current read position inside Ctl
+
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+    unsigned LaneBase = Lane * NumElemPerLane; // index of the first element of this lane
+
+    for (unsigned i = 0; i < SelectableElts; ++i) { // low half of the lane: select within A's matching lane
+      unsigned SelIdx = (Ctl >> BitIdx) & SelMask;
+      ResultElements.push_back(A.getVectorElt(LaneBase + SelIdx));
+      BitIdx = (BitIdx + BitsPerSel) % MaskBits; // wrap so later lanes reuse the 8-bit immediate
+    }
+
+    for (unsigned i = 0; i < SelectableElts; ++i) { // high half of the lane: select within B's matching lane
+      unsigned SelIdx = (Ctl >> BitIdx) & SelMask;
+      ResultElements.push_back(B.getVectorElt(LaneBase + SelIdx));
+      BitIdx = (BitIdx + BitsPerSel) % MaskBits;
+    }
+  }
+
+  Out = APValue(ResultElements.data(), ResultElements.size()); // package the collected elements as the result value
+  return true;
+}
+
 static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
                               APValue &Out) {
   APValue SrcVec, ControlVec;
@@ -12308,7 +12363,17 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
 
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
-
+  case X86::BI__builtin_ia32_shufps:
+  case X86::BI__builtin_ia32_shufps256:
+  case X86::BI__builtin_ia32_shufps512:
+  case X86::BI__builtin_ia32_shufpd:
+  case X86::BI__builtin_ia32_shufpd256:
+  case X86::BI__builtin_ia32_shufpd512: {
+    APValue R;
+    if (!evalShufpspdBuiltin(Info, E, R))
+      return false;
+    return Success(R, E);
+  }
   case X86::BI__builtin_ia32_pshufb128:
   case X86::BI__builtin_ia32_pshufb256:
   case X86::BI__builtin_ia32_pshufb512: {
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index bcffd861fd7f7..7f5c2f56d07c3 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1881,12 +1881,16 @@ __m256d test_mm256_shuffle_pd(__m256d A, __m256d B) {
   return _mm256_shuffle_pd(A, B, 0);
 }
 
+TEST_CONSTEXPR((match_m256d(_mm256_shuffle_pd(((__m256d)(__v4df){1.0, 2.0, 3.0, 4.0}), ((__m256d)(__v4df){5.0, 6.0, 7.0, 8.0}), 15), 2.0, 6.0, 4.0, 8.0)));
+
 __m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
   // CHECK-LABEL: test_mm256_shuffle_ps
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
   return _mm256_shuffle_ps(A, B, 0);
 }
 
+TEST_CONSTEXPR((match_m256(_mm256_shuffle_ps(((__m256)(__v8sf){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256)(__v8sf){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 4), 1.0f, 2.0f, 9.0f, 9.0f, 5.0f, 6.0f, 13.0f, 13.0f)));
+
 __m256d test_mm256_sqrt_pd(__m256d A) {
   // CHECK-LABEL: test_mm256_sqrt_pd
   // CHECK: call {{.*}}<4 x double> @llvm.sqrt.v4f64(<4 x double> %{{.*}})
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 3deaf8efc9632..59d8f581ec38b 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -6735,9 +6735,13 @@ __m512 test_mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V) {
   // CHECK-LABEL: test_mm512_maskz_shuffle_ps
   // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_shuffle_ps(__U, __M, __V, 4); 
+  return _mm512_maskz_shuffle_ps(__U, __M, __V, 4);
 }
 
+TEST_CONSTEXPR((match_m512(_mm512_shuffle_ps(((__m512)(__v16sf){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512)(__v16sf){17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f}), 4), 1.0f, 2.0f, 17.0f, 17.0f, 5.0f, 6.0f, 21.0f, 21.0f, 9.0f, 10.0f, 25.0f, 25.0f, 13.0f, 14.0f, 29.0f, 29.0f)));
+TEST_CONSTEXPR((match_m512d(_mm512_shuffle_pd(((__m512d)(__v8df){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d)(__v8df){9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0}), 48), 1.0, 9.0, 3.0, 11.0, 6.0, 14.0, 7.0, 15.0)));
+TEST_CONSTEXPR((match_m512d(_mm512_maskz_shuffle_pd(0xFF, ((__m512d)(__v8df){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d)(__v8df){9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0}), 48), 1.0, 9.0, 3.0, 11.0, 6.0, 14.0, 7.0, 15.0)));
+
 __m128d test_mm_sqrt_round_sd(__m128d __A, __m128d __B) {
   // CHECK-LABEL: test_mm_sqrt_round_sd
   // CHECK: call {{.*}}<2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 -1, i32 11)
diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
index 9b6bfea918191..51a91789e65dd 100644
--- a/clang/test/CodeGen/X86/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -8933,9 +8933,14 @@ __m256 test_mm256_maskz_shuffle_ps(__mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: test_mm256_maskz_shuffle_ps
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
-  return _mm256_maskz_shuffle_ps(__U, __A, __B, 4); 
+  return _mm256_maskz_shuffle_ps(__U, __A, __B, 4);
 }
 
+TEST_CONSTEXPR((match_m128d(_mm_maskz_shuffle_pd(0x3, ((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 3), 2.0, 4.0)));
+TEST_CONSTEXPR((match_m256d(_mm256_maskz_shuffle_pd(0xF, ((__m256d)(__v4df){1.0, 2.0, 3.0, 4.0}), ((__m256d)(__v4df){5.0, 6.0, 7.0, 8.0}), 15), 2.0, 6.0, 4.0, 8.0)));
+TEST_CONSTEXPR((match_m128(_mm_maskz_shuffle_ps(0xF, ((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 4), 1.0f, 2.0f, 5.0f, 5.0f)));
+TEST_CONSTEXPR((match_m256(_mm256_maskz_shuffle_ps(0xFF, ((__m256)(__v8sf){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256)(__v8sf){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 4), 1.0f, 2.0f, 9.0f, 9.0f, 5.0f, 6.0f, 13.0f, 13.0f)));
+
 __m128d test_mm_rsqrt14_pd(__m128d __A) {
   // CHECK-LABEL: test_mm_rsqrt14_pd
   // CHECK: @llvm.x86.avx512.rsqrt14.pd.128
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index 3bad3426b1586..889c820709870 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -736,6 +736,11 @@ __m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
   return _mm_shuffle_ps(A, B, 0);
 }
 
+TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 4), 1.0f, 2.0f, 5.0f, 5.0f)));
+TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 0), 1.0f, 1.0f, 5.0f, 5.0f)));
+TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 255), 4.0f, 4.0f, 8.0f, 8.0f)));
+TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 27), 4.0f, 3.0f, 6.0f, 5.0f)));
+
 __m128 test_mm_sqrt_ps(__m128 x) {
   // CHECK-LABEL: test_mm_sqrt_ps
   // CHECK: call {{.*}}<4 x float> @llvm.sqrt.v4f32(<4 x float> {{.*}})
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index ade7ef39a008a..a6477dd530242 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1309,6 +1309,11 @@ __m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
   return _mm_shuffle_pd(A, B, 1);
 }
 
+TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 3), 2.0, 4.0)));
+TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 0), 1.0, 3.0)));
+TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 1), 2.0, 3.0)));
+TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 2), 1.0, 4.0)));
+
 __m128i test_mm_shufflehi_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_shufflehi_epi16
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>

clang/lib/AST/ByteCode/InterpBuiltin.cpp

tbaederr · 2025-10-21T04:53:50Z

clang/lib/AST/ByteCode/InterpBuiltin.cpp

+  unsigned IndexMask = BitsPerElem == 2 ? 0x3 : 0x1;
+  unsigned MaskBits = 8;
+
+  TYPE_SWITCH(ElemT, {


Try to minimize what's in the TYPE_SWITCH. AFAICS, it's only one assignment per loop, so two small type switches would be better.

RKSimon · 2025-10-21T14:06:45Z

clang/lib/AST/ByteCode/InterpBuiltin.cpp

+  const auto *VecT = Arg0Type->castAs<VectorType>();
+  PrimType ElemT = *S.getContext().classify(VecT->getElementType());
+  unsigned NumElems = VecT->getNumElements();
+  unsigned LaneWidth = S.getContext().getBitWidth(VecT->getElementType());


LaneWidth -> ElemWidth (Lanes in x86 mean 128-bit subvectors)

RKSimon

We seem to be creating a lot of shuffle decoding helper functions - it'd be great if some of them could share more of the code, or maybe even convert some to a callback system? The callback returns the shuffle source index given the immediate mask and the element index.

github-actions · 2025-10-21T17:49:27Z

✅ With the latest revision this PR passed the C/C++ code formatter.

chaitanyav · 2025-10-22T04:21:25Z

@RKSimon I am looking at the refactoring.

RKSimon

Nice! A few thoughts, but I like the approach :)

RKSimon · 2025-10-23T09:53:11Z

clang/lib/AST/ByteCode/InterpBuiltin.cpp

+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call,
+        [](unsigned DstIdx, unsigned ShuffleMask, unsigned ElementSize) {
+          unsigned NumElemPerLane = 128 / ElementSize;


We could drop the ElementSize argument and simplify this callback code if we split the shufps/shufpd callbacks

RKSimon · 2025-10-23T09:55:10Z

clang/lib/AST/ByteCode/InterpBuiltin.cpp

+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
+    if (IsMaskVector) {


Drop this mask handling until it's actually used in a future PR - keep a simple first implementation that we can build on when we need to.

RKSimon · 2025-10-23T09:56:46Z

clang/lib/AST/ExprConstant.cpp

+  return true;
+}
+
+static bool evalShufpspdBuiltin(EvalInfo &Info, const CallExpr *Call,


Similar to InterpBuiltin - split the shufps/pd callbacks; this should allow us to avoid the evalShufpspdBuiltin wrapper entirely.

… shufps/pd shuffles intrinsics to be used in constexpr * A generic shuffle helper function is introduced to reduce code duplication and facilitate future extensions to other shuffle intrinsics Resolves llvm#161208

chaitanyav · 2025-10-24T00:59:05Z

@RKSimon made changes as per the comments.

chaitanyav marked this pull request as ready for review October 18, 2025 13:20

llvmbot added clang Clang issues not falling into any other category backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:bytecode Issues for the clang bytecode constexpr interpreter labels Oct 18, 2025

chaitanyav requested review from RKSimon and tbaederr October 18, 2025 13:21

tbaederr reviewed Oct 21, 2025

View reviewed changes

RKSimon reviewed Oct 21, 2025

View reviewed changes

chaitanyav force-pushed the issue_161208 branch from bbee445 to 6194583 Compare October 21, 2025 17:47

chaitanyav force-pushed the issue_161208 branch 2 times, most recently from af8846b to df9764f Compare October 22, 2025 03:06

chaitanyav self-assigned this Oct 22, 2025

RKSimon mentioned this pull request Oct 22, 2025

[X86][ByteCode] Allow PSHUFB intrinsics to be used in constexpr #156612 #163148

Merged

chaitanyav force-pushed the issue_161208 branch from df9764f to 1f22c12 Compare October 23, 2025 00:26

RKSimon self-requested a review October 23, 2025 08:38

RKSimon requested changes Oct 23, 2025

View reviewed changes

chaitanyav force-pushed the issue_161208 branch from 1f22c12 to 9cac506 Compare October 24, 2025 00:49

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow shufps/pd shuffles intrinsics to be used in constexpr #164078

[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow shufps/pd shuffles intrinsics to be used in constexpr #164078

chaitanyav commented Oct 18, 2025

Uh oh!

llvmbot commented Oct 18, 2025 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

tbaederr Oct 21, 2025

Uh oh!

RKSimon Oct 21, 2025

Uh oh!

RKSimon left a comment

Uh oh!

github-actions bot commented Oct 21, 2025 •

edited

Loading

Uh oh!

chaitanyav commented Oct 22, 2025

Uh oh!

RKSimon left a comment

Uh oh!

RKSimon Oct 23, 2025

Uh oh!

RKSimon Oct 23, 2025

Uh oh!

RKSimon Oct 23, 2025

Uh oh!

chaitanyav commented Oct 24, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow shufps/pd shuffles intrinsics to be used in constexpr #164078

Are you sure you want to change the base?

[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow shufps/pd shuffles intrinsics to be used in constexpr #164078

Conversation

chaitanyav commented Oct 18, 2025

Uh oh!

llvmbot commented Oct 18, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

tbaederr Oct 21, 2025

Choose a reason for hiding this comment

Uh oh!

RKSimon Oct 21, 2025

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Oct 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

chaitanyav commented Oct 22, 2025

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon Oct 23, 2025

Choose a reason for hiding this comment

Uh oh!

RKSimon Oct 23, 2025

Choose a reason for hiding this comment

Uh oh!

RKSimon Oct 23, 2025

Choose a reason for hiding this comment

Uh oh!

chaitanyav commented Oct 24, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

llvmbot commented Oct 18, 2025 •

edited

Loading

github-actions bot commented Oct 21, 2025 •

edited

Loading