diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 792e2e07ec594..37e6250e7c587 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5253,3 +5253,11 @@ def CountedByRef : Builtin { let Attributes = [NoThrow, CustomTypeChecking]; let Prototype = "int(...)"; } + +// Constant-time select builtin +def CtSelect : Builtin { + let Spellings = ["__builtin_ct_select"]; + let Attributes = [NoThrow, UnevaluatedArguments, + ConstIgnoringExceptions, CustomTypeChecking]; + let Prototype = "void(...)"; +} diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 18641a96063cd..17993660ba395 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -872,6 +872,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("ssve-fp8fma", HasSSVE_FP8FMA) .Case("sme-f8f32", HasSME_F8F32) .Case("sme-f8f16", HasSME_F8F16) + .Case("ctselect", true) .Default(false); } diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 3de17d2c829f1..423a7b8749658 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -664,6 +664,7 @@ bool ARMTargetInfo::hasFeature(StringRef Feature) const { .Case("hwdiv", HWDiv & HWDivThumb) .Case("hwdiv-arm", HWDiv & HWDivARM) .Case("mve", hasMVE()) + .Case("ctselect", true) .Default(false); } diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index e71f10c4c16fc..45fa0379783fe 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -1298,6 +1298,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("cf", HasCF) .Case("zu", HasZU) .Case("branch-hint", HasBranchHint) + .Case("ctselect", true) .Default(false); } diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 92dba32698e51..25b95ce0289b7 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -26,6 +26,10 @@ #include "TargetInfo.h" #include "clang/AST/OSLog.h" #include "clang/AST/StmtVisitor.h" +#include "clang/AST/OperationKinds.h" +#include "clang/AST/Type.h" +#include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/IR/InlineAsm.h" @@ -6441,6 +6445,40 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, auto Str = CGM.GetAddrOfConstantCString(Name, ""); return RValue::get(Str.getPointer()); } + case Builtin::BI__builtin_ct_select: { + if (E->getNumArgs() != 3) { + CGM.getDiags().Report(E->getBeginLoc(), + E->getNumArgs() > 3 + ? 
diag::err_typecheck_call_too_many_args + : diag::err_typecheck_call_too_few_args); + return GetUndefRValue(E->getType()); + } + + auto *Cond = EmitScalarExpr(E->getArg(0)); + auto *A = EmitScalarExpr(E->getArg(1)); + auto *B = EmitScalarExpr(E->getArg(2)); + + // Verify types match + if (A->getType() != B->getType()) { + CGM.getDiags().Report(E->getBeginLoc(), + diag::err_typecheck_convert_incompatible); + return GetUndefRValue(E->getType()); + } + + // Verify condition is integer type + if (!Cond->getType()->isIntegerTy()) { + CGM.getDiags().Report(E->getBeginLoc(), diag::err_typecheck_expect_int); + return GetUndefRValue(E->getType()); + } + + if (Cond->getType()->getIntegerBitWidth() != 1) + Cond = Builder.CreateICmpNE( + Cond, llvm::ConstantInt::get(Cond->getType(), 0), "cond.bool"); + + llvm::Function *Fn = + CGM.getIntrinsic(llvm::Intrinsic::ct_select, {A->getType()}); + return RValue::get(Builder.CreateCall(Fn, {Cond, A, B})); + } } // If this is an alias for a lib function (e.g. __builtin_sin), emit diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 652527a88b160..d7c283367353c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3472,6 +3472,93 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, if (BuiltinCountedByRef(TheCall)) return ExprError(); break; + + case Builtin::BI__builtin_ct_select: { + if (TheCall->getNumArgs() != 3) { + // Simple argument count check without complex diagnostics + if (TheCall->getNumArgs() < 3) { + return Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_few_args_at_least) + << 0 << 3 << TheCall->getNumArgs() << 0 + << TheCall->getCallee()->getSourceRange(); + } else { + return Diag(TheCall->getEndLoc(), diag::err_typecheck_call_too_many_args) + << 0 << 3 << TheCall->getNumArgs() << 0 + << TheCall->getCallee()->getSourceRange(); + } + } + auto *Cond = TheCall->getArg(0); + auto *A = TheCall->getArg(1); + auto *B = TheCall->getArg(2); + + QualType CondTy = Cond->getType(); + if (!CondTy->isIntegerType()) { + return Diag(Cond->getBeginLoc(), diag::err_typecheck_cond_expect_scalar) + << CondTy << Cond->getSourceRange(); + } + + QualType ATy = A->getType(); + QualType BTy = B->getType(); + + // check for scalar or vector scalar type + if ((!ATy->isScalarType() && !ATy->isVectorType()) || + (!BTy->isScalarType() && !BTy->isVectorType())) { + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + + // Check if both operands have the same type or can be implicitly converted + QualType ResultTy; + if (Context.hasSameType(ATy, BTy)) { + ResultTy = ATy; + } else { + // Try to find a common type using the same logic as conditional + // expressions + ExprResult ARes = ExprResult(A); + ExprResult BRes = ExprResult(B); + + // For arithmetic types, allow promotions within the same category only + if (ATy->isArithmeticType() && BTy->isArithmeticType()) { + // Check if both are integer types or both are floating types + bool AIsInteger = ATy->isIntegerType(); + bool BIsInteger = BTy->isIntegerType(); + bool AIsFloating = ATy->isFloatingType(); + bool BIsFloating = BTy->isFloatingType(); + + if ((AIsInteger && BIsInteger) || (AIsFloating && BIsFloating)) { + // Both are in the same category, allow usual arithmetic conversions + ResultTy = UsualArithmeticConversions( + ARes, BRes, TheCall->getBeginLoc(), ArithConvKind::Conditional); + if (ARes.isInvalid() || BRes.isInvalid() || 
ResultTy.isNull()) { + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + // Update the arguments with any necessary implicit casts + TheCall->setArg(1, ARes.get()); + TheCall->setArg(2, BRes.get()); + } else { + // Different categories (int vs float), not allowed + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + } else { + // For non-arithmetic types, they must be exactly the same + return Diag(A->getBeginLoc(), + diag::err_typecheck_cond_incompatible_operands) + << ATy << BTy << A->getSourceRange() << B->getSourceRange(); + } + } + + ExprResult CondRes = PerformContextuallyConvertToBool(Cond); + if (CondRes.isInvalid()) + return ExprError(); + + TheCall->setArg(0, CondRes.get()); + TheCall->setType(ResultTy); + return TheCall; + } break; } if (getLangOpts().HLSL && HLSL().CheckBuiltinFunctionCall(BuiltinID, TheCall)) diff --git a/clang/test/Sema/builtin-ct-select-edge-cases.c b/clang/test/Sema/builtin-ct-select-edge-cases.c new file mode 100644 index 0000000000000..3998e9d68748d --- /dev/null +++ b/clang/test/Sema/builtin-ct-select-edge-cases.c @@ -0,0 +1,384 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter + +// Test with various condition expressions +int test_conditional_expressions(int x, int y, int a, int b) { + // Logical expressions + int result1 = __builtin_ct_select(x && y, a, b); + int result2 = __builtin_ct_select(x || y, a, b); + int result3 = __builtin_ct_select(!x, a, b); + + // Comparison expressions + int result4 = __builtin_ct_select(x == y, a, b); + int result5 = __builtin_ct_select(x != y, a, b); + int result6 = __builtin_ct_select(x < y, a, b); + int result7 = __builtin_ct_select(x > y, a, b); + int result8 = __builtin_ct_select(x <= y, a, b); + int result9 = __builtin_ct_select(x >= y, a, b); + + // Bitwise expressions + int result10 = __builtin_ct_select(x & y, a, b); + int result11 = __builtin_ct_select(x | y, a, b); + int result12 = __builtin_ct_select(x ^ y, a, b); + int result13 = __builtin_ct_select(~x, a, b); + + // Arithmetic expressions + int result14 = __builtin_ct_select(x + y, a, b); + int result15 = __builtin_ct_select(x - y, a, b); + int result16 = __builtin_ct_select(x * y, a, b); + int result17 = __builtin_ct_select(x / y, a, b); + int result18 = __builtin_ct_select(x % y, a, b); + + return result1 + result2 + result3 + result4 + result5 + result6 + result7 + result8 + result9 + result10 + result11 + result12 + result13 + result14 + result15 + result16 + result17 + result18; +} + +// Test with extreme values +int test_extreme_values(int cond) { + // Maximum and minimum values + int max_int = __builtin_ct_select(cond, __INT_MAX__, -__INT_MAX__ - 1); + + // Very large numbers + long long max_ll = __builtin_ct_select(cond, __LONG_LONG_MAX__, -__LONG_LONG_MAX__ - 1); + + // Floating point extremes + float max_float = __builtin_ct_select(cond, __FLT_MAX__, -__FLT_MAX__); + double max_double = __builtin_ct_select(cond, __DBL_MAX__, -__DBL_MAX__); + + return max_int; +} + +// Test with zero and negative zero +int test_zero_values(int cond) { + // Integer zeros + int zero_int = __builtin_ct_select(cond, 0, -0); + + // Floating point zeros + float zero_float = __builtin_ct_select(cond, 0.0f, -0.0f); + double zero_double = __builtin_ct_select(cond, 0.0, -0.0); + + return zero_int; +} 
+ +// Test with infinity and NaN +int test_special_float_values(int cond) { + // Infinity + float inf_float = __builtin_ct_select(cond, __builtin_inff(), -__builtin_inff()); + double inf_double = __builtin_ct_select(cond, __builtin_inf(), -__builtin_inf()); + + // NaN + float nan_float = __builtin_ct_select(cond, __builtin_nanf(""), __builtin_nanf("")); + double nan_double = __builtin_ct_select(cond, __builtin_nan(""), __builtin_nan("")); + + return 0; +} + +// Test with complex pointer scenarios +int test_pointer_edge_cases(int cond) { + int arr[10]; + int *ptr1 = arr; + int *ptr2 = arr + 5; + + // Array pointers + int *result1 = __builtin_ct_select(cond, ptr1, ptr2); + + // Pointer arithmetic + int *result2 = __builtin_ct_select(cond, arr + 1, arr + 2); + + // NULL vs non-NULL + int *result3 = __builtin_ct_select(cond, ptr1, (int*)0); + + // Different pointer types (should fail) + float *fptr = (float*)0; + int *result4 = __builtin_ct_select(cond, ptr1, fptr); // expected-error {{incompatible operand types ('int *' and 'float *')}} + + return *result1; +} + +// Test with function pointers +int func1(int x) { return x; } +int func2(int x) { return x * 2; } +float func3(float x) { return x; } + +int test_function_pointers(int cond, int x) { + // Same signature function pointer + int (*fptr)(int) = __builtin_ct_select(cond, &func1, &func2); + + // Different signature function pointers (should fail) + int (*bad_fptr)(int) = __builtin_ct_select(cond, &func1, &func3); // expected-error {{incompatible operand types ('int (*)(int)' and 'float (*)(float)')}} + + return fptr(x); +} + +// Test with void pointers +void *test_void_pointers(int cond, void *a, void *b) { + return __builtin_ct_select(cond, a, b); +} + +// Test with const/volatile qualifiers +int test_qualifiers(int cond) { + const int ca = 10; + const int cb = 20; + volatile int va = 30; + volatile int vb = 40; + const volatile int cva = 50; + const volatile int cvb = 60; + + // const to const + const int result1 = __builtin_ct_select(cond, ca, cb); + + // volatile to volatile + volatile int result2 = __builtin_ct_select(cond, va, vb); + + // const volatile to const volatile + const volatile int result3 = __builtin_ct_select(cond, cva, cvb); + + return result1 + result2 + result3; +} + +// Test with arrays (should fail as they're not arithmetic or pointer) +int test_arrays(int cond) { + int arr1[5] = {1, 2, 3, 4, 5}; + int arr2[5] = {6, 7, 8, 9, 10}; + + // This should fail?? 
+ int *result = __builtin_ct_select(cond, arr1, arr2); // expected-error {{incompatible operand types ('int[5]' and 'int[5]')}} + + return result[0]; +} + +// Test with structures (should fail) +struct Point { + int x, y; +}; + +struct Point test_structs(int cond) { + struct Point p1 = {1, 2}; + struct Point p2 = {3, 4}; + + return __builtin_ct_select(cond, p1, p2); // expected-error {{incompatible operand types ('struct Point' and 'struct Point')}} +} + +// Test with unions (should fail) +union Data { + int i; + float f; +}; + +union Data test_unions(int cond) { + union Data d1 = {.i = 10}; + union Data d2 = {.i = 20}; + + return __builtin_ct_select(cond, d1, d2); // expected-error {{incompatible operand types ('union Data' and 'union Data')}} +} + +// Test with bit fields (should work as they're integers) +struct BitField { + int a : 4; + int b : 4; +}; + +int test_bit_fields(int cond) { + struct BitField bf1 = {1, 2}; + struct BitField bf2 = {3, 4}; + + // Individual bit fields should work + int result1 = __builtin_ct_select(cond, bf1.a, bf2.a); + int result2 = __builtin_ct_select(cond, bf1.b, bf2.b); + + return result1 + result2; +} + +// Test with designated initializers +int test_designated_init(int cond) { + int arr1[3] = {[0] = 1, [1] = 2, [2] = 3}; + int arr2[3] = {[0] = 4, [1] = 5, [2] = 6}; + + // Access specific elements + int result1 = __builtin_ct_select(cond, arr1[0], arr2[0]); + int result2 = __builtin_ct_select(cond, arr1[1], arr2[1]); + + return result1 + result2; +} + +// Test with complex expressions in arguments +int complex_expr(int x) { return x * x; } + +int test_complex_arguments(int cond, int x, int y) { + // Function calls as arguments + int result1 = __builtin_ct_select(cond, complex_expr(x), complex_expr(y)); + + // Ternary operator as arguments + int result2 = __builtin_ct_select(cond, x > 0 ? x : -x, y > 0 ? 
y : -y); + + // Compound literals + int result3 = __builtin_ct_select(cond, (int){x}, (int){y}); + + return result1 + result2 + result3; +} + +// Test with preprocessor macros +#define MACRO_A 42 +#define MACRO_B 24 +#define MACRO_COND(x) (x > 0) + +int test_macros(int x) { + int result1 = __builtin_ct_select(MACRO_COND(x), MACRO_A, MACRO_B); + + // Nested macros + #define NESTED_SELECT(c, a, b) __builtin_ct_select(c, a, b) + int result2 = NESTED_SELECT(x, 10, 20); + + return result1 + result2; +} + +// Test with string literals (should fail) +const char *test_strings(int cond) { + return __builtin_ct_select(cond, "hello", "world"); // expected-error {{incompatible operand types ('char[6]' and 'char[6]')}} +} + +// Test with variable length arrays (VLA) +int test_vla(int cond, int n) { + int vla1[n]; + int vla2[n]; + + // Individual elements should work + vla1[0] = 1; + vla2[0] = 2; + int result = __builtin_ct_select(cond, vla1[0], vla2[0]); + + return result; +} + +// Test with typedef +typedef int MyInt; +typedef float MyFloat; + +MyInt test_typedef(int cond, MyInt a, MyInt b) { + return __builtin_ct_select(cond, a, b); +} + +// Test with different typedef types (should fail) +MyInt test_different_typedef(int cond, MyInt a, MyFloat b) { + return __builtin_ct_select(cond, a, b); // expected-error {{incompatible operand types ('MyInt' (aka 'int') and 'MyFloat' (aka 'float'))}} +} + +// Test with side effects (should be evaluated) +int side_effect_counter = 0; +int side_effect_func(int x) { + side_effect_counter++; + return x; +} + +int test_side_effects(int cond) { + // Both arguments should be evaluated + int result = __builtin_ct_select(cond, side_effect_func(10), side_effect_func(20)); + return result; +} + +// Test with goto labels (context where expressions are used) +int test_goto_context(int cond, int a, int b) { + int result = __builtin_ct_select(cond, a, b); + + if (result > 0) { + goto positive; + } else { + goto negative; + } + +positive: + return result; + +negative: + return -result; +} + +// Test with switch statements +int test_switch_context(int cond, int a, int b) { + int result = __builtin_ct_select(cond, a, b); + + switch (result) { + case 0: + return 0; + case 1: + return 1; + default: + return -1; + } +} + +// Test with loops +int test_loop_context(int cond, int a, int b) { + int result = __builtin_ct_select(cond, a, b); + int sum = 0; + + for (int i = 0; i < result; i++) { + sum += i; + } + + return sum; +} + +// Test with recursive functions +int factorial(int n) { + if (n <= 1) return 1; + return n * factorial(n - 1); +} + +int test_recursive(int cond, int n) { + int result = __builtin_ct_select(cond, n, n + 1); + return factorial(result); +} + +// Test with inline functions +static inline int inline_func(int x) { + return x * 2; +} + +int test_inline(int cond, int a, int b) { + return __builtin_ct_select(cond, inline_func(a), inline_func(b)); +} + +// Test with static variables +int test_static_vars(int cond) { + static int static_a = 10; + static int static_b = 20; + + return __builtin_ct_select(cond, static_a, static_b); +} + +// Test with extern variables +extern int extern_a; +extern int extern_b; + +int test_extern_vars(int cond) { + return __builtin_ct_select(cond, extern_a, extern_b); +} + +// Test with register variables +int test_register_vars(int cond) { + register int reg_a = 30; + register int reg_b = 40; + + return __builtin_ct_select(cond, reg_a, reg_b); +} + +// Test with thread-local variables (C11) +#if __STDC_VERSION__ >= 201112L +_Thread_local 
int tls_a = 50; +_Thread_local int tls_b = 60; + +int test_tls_vars(int cond) { + return __builtin_ct_select(cond, tls_a, tls_b); +} +#endif + +// Test with atomic variables (C11) +#if __STDC_VERSION__ >= 201112L +#include +atomic_int atomic_a = 70; +atomic_int atomic_b = 80; + +int test_atomic_vars(int cond) { + return __builtin_ct_select(cond, atomic_a, atomic_b); // expected-error {{incompatible operand types ('atomic_int' (aka '_Atomic(int)') and 'atomic_int')}} +} +#endif diff --git a/clang/test/Sema/builtin-ct-select.c b/clang/test/Sema/builtin-ct-select.c new file mode 100644 index 0000000000000..7749eb52eecb3 --- /dev/null +++ b/clang/test/Sema/builtin-ct-select.c @@ -0,0 +1,683 @@ +// RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s + +// Test integer types +int test_int(int cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_int + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +long test_long(int cond, long a, long b) { + // CHECK-LABEL: define {{.*}} @test_long + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}}) + // CHECK: ret i64 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +short test_short(int cond, short a, short b) { + // CHECK-LABEL: define {{.*}} @test_short + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i16 @llvm.ct.select.i16(i1 [[COND]], i16 %{{.*}}, i16 %{{.*}}) + // CHECK: ret i16 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +unsigned char test_uchar(int cond, unsigned char a, unsigned char b) { + // CHECK-LABEL: define {{.*}} @test_uchar + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i8 @llvm.ct.select.i8(i1 [[COND]], i8 %{{.*}}, i8 %{{.*}}) + // CHECK: ret i8 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +long long test_longlong(int cond, long long a, long long b) { + // CHECK-LABEL: define {{.*}} @test_longlong + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}}) + // CHECK: ret i64 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test floating point types +float test_float(int cond, float a, float b) { + // CHECK-LABEL: define {{.*}} @test_float + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}}) + // CHECK: ret float [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +double test_double(int cond, double a, double b) { + // CHECK-LABEL: define {{.*}} @test_double + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}}) + // CHECK: ret double [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test pointer types +int *test_pointer(int cond, int *a, int *b) { + // CHECK-LABEL: define {{.*}} @test_pointer + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[COND]], ptr %{{.*}}, ptr %{{.*}}) + // CHECK: ret ptr [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test with different condition types +int test_char_cond(char cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_char_cond + // 
CHECK: [[COND:%.*]] = icmp ne i8 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +int test_long_cond(long cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_long_cond + // CHECK: [[COND:%.*]] = icmp ne i64 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test with boolean condition +int test_bool_cond(_Bool cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_bool_cond + // CHECK: [[COND:%.*]] = trunc i8 %{{.*}} to i1 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test with constants +int test_constant_cond(void) { + // CHECK-LABEL: define {{.*}} @test_constant_cond + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 true, i32 42, i32 24) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(1, 42, 24); +} + +int test_zero_cond(void) { + // CHECK-LABEL: define {{.*}} @test_zero_cond + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 false, i32 42, i32 24) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(0, 42, 24); +} + +// Test type promotion +int test_promotion(int cond, short a, short b) { + // CHECK-LABEL: define {{.*}} @test_promotion + // CHECK-DAG: [[A_EXT:%.*]] = sext i16 %{{.*}} to i32 + // CHECK-DAG: [[B_EXT:%.*]] = sext i16 %{{.*}} to i32 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 [[A_EXT]], i32 [[B_EXT]]) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, (int)a, (int)b); +} + +// Test mixed signedness +unsigned int test_mixed_signedness(int cond, int a, unsigned int b) { + // CHECK-LABEL: define {{.*}} @test_mixed_signedness + // CHECK-DAG: [[A_EXT:%.*]] = sext i32 %{{.*}} to i64 + // CHECK-DAG: [[B_EXT:%.*]] = zext i32 %{{.*}} to i64 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 [[A_EXT]], i64 [[B_EXT]]) + // CHECK: [[RESULT_TRUNC:%.*]] = trunc i64 [[RESULT]] to i32 + // CHECK: ret i32 [[RESULT_TRUNC]] + return __builtin_ct_select(cond, (long)a, (long)b); +} + +// Test complex expression +int test_complex_expr_alt(int x, int y) { + // CHECK-LABEL: define {{.*}} @test_complex_expr_alt + // CHECK-DAG: [[CMP:%.*]] = icmp sgt i32 %{{.*}}, 0 + // CHECK-DAG: [[ADD:%.*]] = add nsw i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[SUB:%.*]] = sub nsw i32 %{{.*}}, %{{.*}} + // Separate the final sequence to ensure proper ordering + // CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP]], i32 [[ADD]], i32 [[SUB]]) + // CHECK-NEXT: ret i32 [[RESULT]] + return __builtin_ct_select(x > 0, x + y, x - y); +} + +// Test nested calls +int test_nested_structured(int cond1, int cond2, int a, int b, int c) { + // CHECK-LABEL: define {{.*}} @test_nested_structured + // Phase 1: Conditions (order doesn't matter) + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + + // Phase 2: Inner select (must happen before outer) + // CHECK: [[INNER:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}}) + + // Phase 3: Outer select (must use inner result) + // CHECK: [[RESULT:%.*]] = 
call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 [[INNER]], i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond1, __builtin_ct_select(cond2, a, b), c); +} + +// Test with function calls +int helper(int x) { return x * 2; } +int test_function_calls(int cond, int x, int y) { + // CHECK-LABEL: define {{.*}} @test_function_calls + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[CALL1:%.*]] = call i32 @helper(i32 noundef %{{.*}}) + // CHECK-DAG: [[CALL2:%.*]] = call i32 @helper(i32 noundef %{{.*}}) + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 [[CALL1]], i32 [[CALL2]]) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(cond, helper(x), helper(y)); +} + +// Test using ct_select as condition for another ct_select +int test_intrinsic_condition(int cond1, int cond2, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_intrinsic_condition + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[INNER_COND:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 [[INNER_COND]], 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond1, cond2, a), b, c); +} + +// Test using comparison result of ct_select as condition +int test_comparison_condition(int cond, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_comparison_condition + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: [[CMP:%.*]] = icmp sgt i32 [[FIRST_SELECT]], %{{.*}} + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond, a, b) > c, d, a); +} + +// Test using ct_select result in arithmetic as condition +int test_arithmetic_condition(int cond, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_arithmetic_condition + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: [[ADD:%.*]] = add nsw i32 [[FIRST_SELECT]], %{{.*}} + // CHECK: [[FINAL_COND:%.*]] = icmp ne i32 [[ADD]], 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond, a, b) + c, d, a); +} + +// Test chained ct_select as conditions +int test_chained_conditions(int cond1, int cond2, int cond3, int a, int b, int c, int d, int e) { + // CHECK-LABEL: define {{.*}} @test_chained_conditions + // CHECK: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[FIRST:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[SECOND:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + int first_select = __builtin_ct_select(cond1, a, b); + int second_select = __builtin_ct_select(cond2, first_select, c); + return __builtin_ct_select(second_select, d, 
e); +} + +// Test using ct_select with pointer condition +//int test_pointer_condition(int *ptr1, int *ptr2, int a, int b, int c) { + // NO-CHECK-LABEL: define {{.*}} @test_pointer_condition + // NO-CHECK: [[PTR_COND:%.*]] = icmp ne ptr %{{.*}}, null + // NO-CHECK: [[PTR_SELECT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[PTR_COND]], ptr %{{.*}}, ptr %{{.*}}) + // NO-CHECK: [[FINAL_COND:%.*]] = icmp ne ptr [[PTR_SELECT]], null + // NO-CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // NO-CHECK: ret i32 [[RESULT]] +// return __builtin_ct_select(__builtin_ct_select(ptr1, ptr1, ptr2), a, b); +//} + + +// Test using ct_select result in logical operations as condition +int test_logical_condition(int cond1, int cond2, int a, int b, int c, int d) { + // CHECK-LABEL: define {{.*}} @test_logical_condition + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[SELECT_BOOL:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond1, a, b) && cond2, c, d); +} + +// Test multiple levels of ct_select as conditions +int test_deep_condition_nesting(int cond1, int cond2, int cond3, int a, int b, int c, int d, int e, int f) { + // CHECK-LABEL: define {{.*}} @test_deep_condition_nesting + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[INNER1:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[INNER1_COND:%.*]] = icmp ne i32 [[INNER1]], 0 + // CHECK-DAG: [[INNER2:%.*]] = call i32 @llvm.ct.select.i32(i1 [[INNER1_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK-DAG: [[OUTER:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 [[INNER2]], i32 %{{.*}}) + // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 [[OUTER]], 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(cond1, __builtin_ct_select(__builtin_ct_select(cond2, a, b), c, d), e), f, a); +} + +// Test ct_select with complex condition expressions +int test_complex_condition_expr(int x, int y, int z, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_complex_condition_expr + // CHECK: [[CMP1:%.*]] = icmp sgt i32 %{{.*}}, %{{.*}} + // CHECK: [[SELECT1:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP1]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: [[CMP2:%.*]] = icmp slt i32 [[SELECT1]], %{{.*}} + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP2]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + return __builtin_ct_select(__builtin_ct_select(x > y, x, y) < z, a, b); +} + +// Test vector types - 128-bit vectors +typedef int __attribute__((vector_size(16))) int4; +typedef float __attribute__((vector_size(16))) float4; +typedef short __attribute__((vector_size(16))) short8; +typedef char __attribute__((vector_size(16))) char16; + +int4 test_vector_int4(int cond, int4 a, int4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_int4 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: ret <4 x 
i32> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +float4 test_vector_float4(int cond, float4 a, float4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_float4 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +short8 test_vector_short8(int cond, short8 a, short8 b) { + // CHECK-LABEL: define {{.*}} @test_vector_short8 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <8 x i16> @llvm.ct.select.v8i16(i1 [[COND]], <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: ret <8 x i16> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +char16 test_vector_char16(int cond, char16 a, char16 b) { + // CHECK-LABEL: define {{.*}} @test_vector_char16 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <16 x i8> @llvm.ct.select.v16i8(i1 [[COND]], <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: ret <16 x i8> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test 256-bit vectors +typedef int __attribute__((vector_size(32))) int8; +typedef float __attribute__((vector_size(32))) float8; +typedef double __attribute__((vector_size(32))) double4; + +int8 test_vector_int8(int cond, int8 a, int8 b) { + // CHECK-LABEL: define {{.*}} @test_vector_int8 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call <8 x i32> @llvm.ct.select.v8i32(i1 [[COND]], <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +float8 test_vector_float8(int cond, float8 a, float8 b) { + // CHECK-LABEL: define {{.*}} @test_vector_float8 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call <8 x float> @llvm.ct.select.v8f32(i1 [[COND]], <8 x float> %{{.*}}, <8 x float> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +double4 test_vector_double4(int cond, double4 a, double4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_double4 + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call <4 x double> @llvm.ct.select.v4f64(i1 [[COND]], <4 x double> %{{.*}}, <4 x double> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +// Test 512-bit vectors +typedef int __attribute__((vector_size(64))) int16; +typedef float __attribute__((vector_size(64))) float16; + +int16 test_vector_int16(int cond, int16 a, int16 b) { + // CHECK-LABEL: define {{.*}} @test_vector_int16 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <16 x i32> @llvm.ct.select.v16i32(i1 [[COND]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +float16 test_vector_float16(int cond, float16 a, float16 b) { + // CHECK-LABEL: define {{.*}} @test_vector_float16 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <16 x float> @llvm.ct.select.v16f32(i1 [[COND]], <16 x float> %{{.*}}, <16 x float> %{{.*}}) + return __builtin_ct_select(cond, a, b); +} + +// Test vector operations with different condition types +int4 test_vector_char_cond(char cond, int4 a, int4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_char_cond + // CHECK: [[COND:%.*]] = icmp ne i8 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: ret <4 x i32> [[RESULT]] + return 
__builtin_ct_select(cond, a, b); +} + +float4 test_vector_long_cond(long cond, float4 a, float4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_long_cond + // CHECK: [[COND:%.*]] = icmp ne i64 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test vector constants +int4 test_vector_constant_cond(void) { + // CHECK-LABEL: define {{.*}} @test_vector_constant_cond + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: ret <4 x i32> [[RESULT]] + int4 a = {1, 2, 3, 4}; + int4 b = {5, 6, 7, 8}; + return __builtin_ct_select(1, a, b); +} + +float4 test_vector_zero_cond(void) { + // CHECK-LABEL: define {{.*}} @test_vector_zero_cond + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 false, <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float4 a = {1.0f, 2.0f, 3.0f, 4.0f}; + float4 b = {5.0f, 6.0f, 7.0f, 8.0f}; + return __builtin_ct_select(0, a, b); +} + +// Test nested vector selections +int4 test_vector_nested(int cond1, int cond2, int4 a, int4 b, int4 c) { + // CHECK-LABEL: define {{.*}} @test_vector_nested + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[INNER:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND2]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND1]], <4 x i32> [[INNER]], <4 x i32> %{{.*}}) + // CHECK: ret <4 x i32> [[RESULT]] + return __builtin_ct_select(cond1, __builtin_ct_select(cond2, a, b), c); +} + +// Test vector selection with complex expressions +float4 test_vector_complex_expr(int x, int y, float4 a, float4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_complex_expr + // CHECK: [[CMP:%.*]] = icmp sgt i32 %{{.*}}, %{{.*}} + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[CMP]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + return __builtin_ct_select(x > y, a, b); +} + +// Test vector with different element sizes +typedef long long __attribute__((vector_size(16))) long2; +typedef double __attribute__((vector_size(16))) double2; + +long2 test_vector_long2(int cond, long2 a, long2 b) { + // CHECK-LABEL: define {{.*}} @test_vector_long2 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <2 x i64> @llvm.ct.select.v2i64(i1 [[COND]], <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK: ret <2 x i64> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +double2 test_vector_double2(int cond, double2 a, double2 b) { + // CHECK-LABEL: define {{.*}} @test_vector_double2 + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <2 x double> @llvm.ct.select.v2f64(i1 [[COND]], <2 x double> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: ret <2 x double> [[RESULT]] + return __builtin_ct_select(cond, a, b); +} + +// Test mixed vector operations +int4 test_vector_from_scalar_condition(int4 vec_cond, int4 a, int4 b) { + // CHECK-LABEL: define {{.*}} @test_vector_from_scalar_condition + // Extract first element and use as condition + int scalar_cond = vec_cond[0]; + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // 
CHECK: ret <4 x i32> [[RESULT]] + return __builtin_ct_select(scalar_cond, a, b); +} + +// Test vector chaining +float4 test_vector_chaining(int cond1, int cond2, int cond3, float4 a, float4 b, float4 c, float4 d) { + // CHECK-LABEL: define {{.*}} @test_vector_chaining + // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[COND3:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[FIRST:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND1]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK-DAG: [[SECOND:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND2]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK-DAG: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND3]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float4 first = __builtin_ct_select(cond1, a, b); + float4 second = __builtin_ct_select(cond2, first, c); + return __builtin_ct_select(cond3, second, d); +} + +// Test special floating point values - NaN +float test_nan_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_nan_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float 1.000000e+00) + // CHECK: ret float [[RESULT]] + float nan_val = __builtin_nanf(""); + return __builtin_ct_select(cond, nan_val, 1.0f); +} + +double test_nan_double_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_nan_double_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double 2.000000e+00) + // CHECK: ret double [[RESULT]] + double nan_val = __builtin_nan(""); + return __builtin_ct_select(cond, nan_val, 2.0); +} + +// Test infinity values +float test_infinity_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_infinity_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}}) + // CHECK: ret float [[RESULT]] + float pos_inf = __builtin_inff(); + float neg_inf = -__builtin_inff(); + return __builtin_ct_select(cond, pos_inf, neg_inf); +} + +double test_infinity_double_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_infinity_double_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}}) + // CHECK: ret double [[RESULT]] + double pos_inf = __builtin_inf(); + double neg_inf = -__builtin_inf(); + return __builtin_ct_select(cond, pos_inf, neg_inf); +} + +// Test subnormal/denormal values +float test_subnormal_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_subnormal_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}}) + // CHECK: ret float [[RESULT]] + // Very small subnormal values + float subnormal1 = 1e-40f; + float subnormal2 = 1e-45f; + return __builtin_ct_select(cond, subnormal1, subnormal2); +} + +// Test integer overflow boundaries +int test_integer_overflow_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_integer_overflow_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + 
int max_int = __INT_MAX__; + int min_int = (-__INT_MAX__ - 1); + return __builtin_ct_select(cond, max_int, min_int); +} + +long long test_longlong_overflow_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_longlong_overflow_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}}) + // CHECK: ret i64 [[RESULT]] + long long max_ll = __LONG_LONG_MAX__; + long long min_ll = (-__LONG_LONG_MAX__ - 1); + return __builtin_ct_select(cond, max_ll, min_ll); +} + +// Test unsigned overflow boundaries +unsigned int test_unsigned_overflow_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_unsigned_overflow_operands + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + unsigned int max_uint = 4294967295; + unsigned int min_uint = 0; + return __builtin_ct_select(cond, max_uint, min_uint); +} + +// Test null pointer dereference avoidance +int* test_null_pointer_operands(int cond, int* valid_ptr) { + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[COND]], ptr %{{.*}}, ptr %{{.*}}) + // CHECK: ret ptr [[RESULT]] + int* null_ptr = (int*)0; + return __builtin_ct_select(cond, null_ptr, valid_ptr); +} + +// Test volatile operations +volatile int global_volatile = 42; +int test_volatile_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_volatile_operands + // CHECK-DAG: [[VOLATILE_LOAD:%.*]] = load volatile i32, ptr {{.*}} + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 100) + // CHECK: ret i32 [[RESULT]] + volatile int vol_val = global_volatile; + return __builtin_ct_select(cond, vol_val, 100); +} + +// Test uninitialized variable behavior (should still work with ct_select) +int test_uninitialized_operands(int cond, int initialized) { + // CHECK-LABEL: define {{.*}} @test_uninitialized_operands + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + int uninitialized; // Intentionally uninitialized + return __builtin_ct_select(cond, uninitialized, initialized); +} + +// Test zero division avoidance patterns +int test_division_by_zero_avoidance(int cond, int dividend, int divisor) { + // CHECK-LABEL: define {{.*}} @test_division_by_zero_avoidance + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[DIV_RESULT:%.*]] = sdiv i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[SAFE_DIVISOR:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 1) + // First get a safe divisor (never zero) + int safe_divisor = __builtin_ct_select(divisor != 0, divisor, 1); + // Then perform division with guaranteed non-zero divisor + return dividend / safe_divisor; +} + +// Test array bounds checking patterns +int test_array_bounds_protection(int cond, int index, int* array) { + // CHECK-LABEL: define {{.*}} @test_array_bounds_protection + // CHECK-DAG: [[SAFE_INDEX:%.*]] = call i32 @llvm.ct.select.i32(i1 {{.*}}, i32 %{{.*}}, i32 0) + // Use ct_select to ensure safe array indexing + int safe_index = __builtin_ct_select(index >= 0 && index < 10, index, 0); + return array[safe_index]; +} + +// Test bit manipulation edge cases +unsigned int test_bit_manipulation_edge_cases(int 
cond, unsigned int value) { + // CHECK-LABEL: define {{.*}} @test_bit_manipulation_edge_cases + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[SHIFT_LEFT:%.*]] = shl i32 %{{.*}}, 31 + // CHECK-DAG: [[SHIFT_RIGHT:%.*]] = lshr i32 %{{.*}}, 31 + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + // Test extreme bit shifts that could cause undefined behavior + unsigned int left_shift = value << 31; // Could overflow + unsigned int right_shift = value >> 31; // Extract sign bit + return __builtin_ct_select(cond, left_shift, right_shift); +} + +// Test signed integer wraparound +int test_signed_wraparound(int cond, int a, int b) { + // CHECK-LABEL: define {{.*}} @test_signed_wraparound + // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK-DAG: [[ADD:%.*]] = add nsw i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[SUB:%.*]] = sub nsw i32 %{{.*}}, %{{.*}} + // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}}) + // CHECK: ret i32 [[RESULT]] + int sum = a + b; // Could overflow + int diff = a - b; // Could underflow + return __builtin_ct_select(cond, sum, diff); +} + +// Test vector NaN handling +float4 test_vector_nan_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_vector_nan_operands + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float nan_val = __builtin_nanf(""); + float4 nan_vec = {nan_val, nan_val, nan_val, nan_val}; + float4 normal_vec = {1.0f, 2.0f, 3.0f, 4.0f}; + return __builtin_ct_select(cond, nan_vec, normal_vec); +} + +// Test vector infinity handling +float4 test_vector_infinity_operands(int cond) { + // CHECK-LABEL: define {{.*}} @test_vector_infinity_operands + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: ret <4 x float> [[RESULT]] + float pos_inf = __builtin_inff(); + float neg_inf = -__builtin_inff(); + float4 inf_vec = {pos_inf, neg_inf, pos_inf, neg_inf}; + float4 zero_vec = {0.0f, 0.0f, 0.0f, 0.0f}; + return __builtin_ct_select(cond, inf_vec, zero_vec); +} + +// Test mixed special values +double test_mixed_special_values(int cond) { + // CHECK-LABEL: define {{.*}} @test_mixed_special_values + // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0 + // CHECK: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}}) + // CHECK: ret double [[RESULT]] + double nan_val = __builtin_nan(""); + double inf_val = __builtin_inf(); + return __builtin_ct_select(cond, nan_val, inf_val); +} + +// Test constant-time memory access pattern +int test_constant_time_memory_access(int secret_index, int* data_array) { + // CHECK-LABEL: define {{.*}} @test_constant_time_memory_access + // This pattern ensures constant-time memory access regardless of secret_index value + int result = 0; + // Use ct_select to accumulate values without revealing the secret index + for (int i = 0; i < 8; i++) { + int is_target = (i == secret_index); + int current_value = data_array[i]; + int selected_value = __builtin_ct_select(is_target, current_value, 0); + result += selected_value; + } + return result; +} + +// Test timing-attack resistant comparison +int test_timing_resistant_comparison(const char* secret, const char* guess) { + 
// CHECK-LABEL: define {{.*}} @test_timing_resistant_comparison + // Constant-time string comparison using ct_select + int match = 1; + for (int i = 0; i < 32; i++) { + int chars_equal = (secret[i] == guess[i]); + int both_null = (secret[i] == 0) && (guess[i] == 0); + int still_matching = __builtin_ct_select(chars_equal || both_null, match, 0); + match = __builtin_ct_select(both_null, match, still_matching); + } + return match; +} diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index ff3dd0d4c3c51..656f6e718f029 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -783,6 +783,10 @@ enum NodeType { /// i1 then the high bits must conform to getBooleanContents. SELECT, + /// Constant-time Select, implemented with CMOV instruction. This is used to + /// implement constant-time select. + CTSELECT, + /// Select with a vector condition (op #0) and two vector operands (ops #1 /// and #2), returning a vector result. All vectors have the same length. /// Much like the scalar select and setcc, each bit in the condition selects diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index df6ce0fe1b037..00d2f5bd6c8eb 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1348,6 +1348,13 @@ class SelectionDAG { return getNode(Opcode, DL, VT, Cond, LHS, RHS, Flags); } + SDValue getCTSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, + SDValue RHS, SDNodeFlags Flags = SDNodeFlags()) { + assert(LHS.getValueType() == VT && RHS.getValueType() == VT && + "Cannot use select on differing types"); + return getNode(ISD::CTSELECT, DL, VT, Cond, LHS, RHS, Flags); + } + /// Helper function to make it easier to build SelectCC's if you just have an /// ISD::CondCode instead of an SDValue. SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 69713d0d84011..55c62ff7e7216 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -426,6 +426,10 @@ struct SDNodeFlags { NonNeg | NoNaNs | NoInfs | SameSign, FastMathFlags = NoNaNs | NoInfs | NoSignedZeros | AllowReciprocal | AllowContract | ApproximateFuncs | AllowReassociation, + + // Instructs DAGCombiner to skip optimization passes for this node. + // Preserves the operation as-is without folding, merging, or elimination. + NoMerge = 1 << 15, }; /// Default constructor turns off all optimization flags. @@ -458,6 +462,7 @@ struct SDNodeFlags { void setAllowReassociation(bool b) { setFlag(b); } void setNoFPExcept(bool b) { setFlag(b); } void setUnpredictable(bool b) { setFlag(b); } + void setNoMerge(bool b) { setFlag(b); } // These are accessors for each flag. 
bool hasNoUnsignedWrap() const { return Flags & NoUnsignedWrap; } @@ -475,6 +480,7 @@ struct SDNodeFlags { bool hasAllowReassociation() const { return Flags & AllowReassociation; } bool hasNoFPExcept() const { return Flags & NoFPExcept; } bool hasUnpredictable() const { return Flags & Unpredictable; } + bool hasNoMerge() const { return Flags & NoMerge; } bool operator==(const SDNodeFlags &Other) const { return Flags == Other.Flags; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 73f2c55a71125..375a4bf4c5c03 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -242,11 +242,15 @@ class LLVM_ABI TargetLoweringBase { /// Enum that describes what type of support for selects the target has. enum SelectSupportKind { - ScalarValSelect, // The target supports scalar selects (ex: cmov). - ScalarCondVectorVal, // The target supports selects with a scalar condition - // and vector values (ex: cmov). - VectorMaskSelect // The target supports vector selects with a vector - // mask (ex: x86 blends). + ScalarValSelect, // The target supports scalar selects (ex: cmov). + ScalarCondVectorVal, // The target supports selects with a scalar condition + // and vector values (ex: cmov). + VectorMaskSelect, // The target supports vector selects with a vector + // mask (ex: x86 blends). + CtSelect, // The target implements a custom constant-time select. + ScalarCondVectorValCtSelect, // The target supports selects with a scalar + // condition and vector values. + VectorMaskValCtSelect, // The target supports vector selects with a vector }; /// Enum that specifies what an atomic load/AtomicRMWInst is expanded @@ -476,8 +480,8 @@ class LLVM_ABI TargetLoweringBase { MachineMemOperand::Flags getVPIntrinsicMemOperandFlags(const VPIntrinsic &VPIntrin) const; - virtual bool isSelectSupported(SelectSupportKind /*kind*/) const { - return true; + virtual bool isSelectSupported(SelectSupportKind kind) const { + return kind != CtSelect; } /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 8856eda250ed6..32f8fce3f05d9 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1813,6 +1813,13 @@ def int_coro_subfn_addr : DefaultAttrsIntrinsic< [IntrReadMem, IntrArgMemOnly, ReadOnly>, NoCapture>]>; +///===-------------------------- Constant Time Intrinsics --------------------------===// +// +// Intrinsic to support constant time select +def int_ct_select : DefaultAttrsIntrinsic<[llvm_any_ty], + [llvm_i1_ty, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrWillReturn, IntrNoDuplicate, NoUndef]>; + ///===-------------------------- Other Intrinsics --------------------------===// // // TODO: We should introduce a new memory kind fo traps (and other side effects diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 07a858fd682fc..c783a2aa9258f 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -214,6 +214,10 @@ def SDTSelect : SDTypeProfile<1, 3, [ // select SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3> ]>; +def SDTCtSelect : SDTypeProfile<1, 3, [ // ctselect + SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3> +]>; + def SDTVSelect : SDTypeProfile<1, 3, [ // vselect SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 
3>, SDTCisSameNumEltsAs<0, 1> ]>; @@ -717,6 +721,7 @@ def reset_fpmode : SDNode<"ISD::RESET_FPMODE", SDTNone, [SDNPHasChain]>; def setcc : SDNode<"ISD::SETCC" , SDTSetCC>; def select : SDNode<"ISD::SELECT" , SDTSelect>; +def ctselect : SDNode<"ISD::CTSELECT" , SDTCtSelect>; def vselect : SDNode<"ISD::VSELECT" , SDTVSelect>; def selectcc : SDNode<"ISD::SELECT_CC" , SDTSelectCC>; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c97300d64d455..06167fb7c79d6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -484,6 +484,7 @@ namespace { SDValue visitCTTZ_ZERO_UNDEF(SDNode *N); SDValue visitCTPOP(SDNode *N); SDValue visitSELECT(SDNode *N); + SDValue visitCTSELECT(SDNode *N); SDValue visitVSELECT(SDNode *N); SDValue visitVP_SELECT(SDNode *N); SDValue visitSELECT_CC(SDNode *N); @@ -1898,6 +1899,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) { } SDValue DAGCombiner::visit(SDNode *N) { + if (N->getFlags().hasNoMerge()) + return SDValue(); + // clang-format off switch (N->getOpcode()) { default: break; @@ -1968,6 +1972,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N); case ISD::CTPOP: return visitCTPOP(N); case ISD::SELECT: return visitSELECT(N); + case ISD::CTSELECT: return visitCTSELECT(N); case ISD::VSELECT: return visitVSELECT(N); case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); @@ -6016,6 +6021,7 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, N0CC = cast(N0.getOperand(4))->get(); break; case ISD::SELECT: + case ISD::CTSELECT: case ISD::VSELECT: if (N0.getOperand(0).getOpcode() != ISD::SETCC) return SDValue(); @@ -12168,8 +12174,9 @@ template static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT || - N->getOpcode() == ISD::VP_SELECT) && - "Expected a (v)(vp.)select"); + N->getOpcode() == ISD::VP_SELECT || + N->getOpcode() == ISD::CTSELECT) && + "Expected a (v)(vp.)(ct) select"); SDValue Cond = N->getOperand(0); SDValue T = N->getOperand(1), F = N->getOperand(2); EVT VT = N->getValueType(0); @@ -12531,6 +12538,109 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitCTSELECT(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue N2 = N->getOperand(2); + EVT VT = N->getValueType(0); + EVT VT0 = N0.getValueType(); + SDLoc DL(N); + SDNodeFlags Flags = N->getFlags(); + + if (SDValue V = foldBoolSelectToLogic(N, DL, DAG)) + return V; + + // ctselect (not Cond), N1, N2 -> ctselect Cond, N2, N1 + if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) { + SDValue SelectOp = DAG.getNode(ISD::CTSELECT, DL, VT, F, N2, N1); + SelectOp->setFlags(Flags); + return SelectOp; + } + + if (VT0 == MVT::i1) { + // The code in this block deals with the following 2 equivalences: + // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) + // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y) + // The target can specify its preferred form with the + // shouldNormalizeToSelectSequence() callback. However we always transform + // to the right anyway if we find the inner select exists in the DAG anyway + // and we always transform to the left side if we know that we can further + // optimize the combination of the conditions. 
+ bool normalizeToSequence = + TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); + // ctselect (and Cond0, Cond1), X, Y + // -> ctselect Cond0, (ctselect Cond1, X, Y), Y + if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), + Cond1, N1, N2, Flags); + if (normalizeToSequence || !InnerSelect.use_empty()) + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, + InnerSelect, N2, Flags); + // Cleanup on failure. + if (InnerSelect.use_empty()) + recursivelyDeleteUnusedNodes(InnerSelect.getNode()); + } + // ctselect (or Cond0, Cond1), X, Y -> ctselect Cond0, X, (ctselect Cond1, + // X, Y) + if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { + SDValue Cond0 = N0->getOperand(0); + SDValue Cond1 = N0->getOperand(1); + SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), + Cond1, N1, N2, Flags); + if (normalizeToSequence || !InnerSelect.use_empty()) + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, N1, + InnerSelect, Flags); + // Cleanup on failure. + if (InnerSelect.use_empty()) + recursivelyDeleteUnusedNodes(InnerSelect.getNode()); + } + + // ctselect Cond0, (ctselect Cond1, X, Y), Y -> ctselect (and Cond0, Cond1), + // X, Y + if (N1->getOpcode() == ISD::CTSELECT && N1->hasOneUse()) { + SDValue N1_0 = N1->getOperand(0); + SDValue N1_1 = N1->getOperand(1); + SDValue N1_2 = N1->getOperand(2); + if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { + // Create the actual and node if we can generate good code for it. + if (!normalizeToSequence) { + SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1, + N2, Flags); + } + // Otherwise see if we can optimize the "and" to a better pattern. + if (SDValue Combined = visitANDLike(N0, N1_0, N)) { + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, + N1_1, N2, Flags); + } + } + } + // ctselect Cond0, X, (ctselect Cond1, X, Y) -> ctselect (or Cond0, Cond1), + // X, Y + if (N2->getOpcode() == ISD::CTSELECT && N2->hasOneUse()) { + SDValue N2_0 = N2->getOperand(0); + SDValue N2_1 = N2->getOperand(1); + SDValue N2_2 = N2->getOperand(2); + if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { + // Create the actual or node if we can generate good code for it. + if (!normalizeToSequence) { + SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2, + Flags); + } + // Otherwise see if we can optimize to a better pattern. + if (SDValue Combined = visitORLike(N0, N2_0, DL)) + return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, N1, + N2_2, Flags); + } + } + } + + return SDValue(); +} + // This function assumes all the vselect's arguments are CONCAT_VECTOR // nodes and that the condition is a BV of ConstantSDNodes (or undefs). 
static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 5fb7e63cfb605..54d51aaa15442 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4135,6 +4135,40 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } Results.push_back(Tmp1); break; + case ISD::CTSELECT: { + Tmp1 = Node->getOperand(0); + Tmp2 = Node->getOperand(1); + Tmp3 = Node->getOperand(2); + EVT VT = Tmp2.getValueType(); + if (VT.isVector()) { + SmallVector Elements; + unsigned NumElements = VT.getVectorNumElements(); + EVT ScalarVT = VT.getScalarType(); + for (unsigned Idx = 0; Idx < NumElements; ++Idx) { + SDValue IdxVal = DAG.getConstant(Idx, dl, MVT::i64); + SDValue TVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp2, IdxVal); + SDValue FVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp3, IdxVal); + Elements.push_back(DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags())); + } + Tmp1 = DAG.getBuildVector(VT, dl, Elements); + } else if (VT.isFloatingPoint()) { + EVT IntegerVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + Tmp2 = DAG.getBitcast(IntegerVT, Tmp2); + Tmp3 = DAG.getBitcast(IntegerVT, Tmp3); + Tmp1 = DAG.getBitcast(VT, DAG.getCTSelect(dl, IntegerVT, Tmp1, Tmp2, Tmp3, Node->getFlags())); + } else { + assert(VT.isInteger()); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + auto [Tmp2Lo, Tmp2Hi] = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT); + auto [Tmp3Lo, Tmp3Hi] = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT); + SDValue ResLo = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags()); + SDValue ResHi = DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags()); + Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi); + Tmp1->setFlags(Node->getFlags()); + } + Results.push_back(Tmp1); + break; + } case ISD::BR_JT: { SDValue Chain = Node->getOperand(0); SDValue Table = Node->getOperand(1); @@ -5473,7 +5507,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2)); break; } - case ISD::SELECT: { + case ISD::SELECT: + case ISD::CTSELECT: { unsigned ExtOp, TruncOp; if (Node->getValueType(0).isVector() || Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) { @@ -5491,7 +5526,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1)); Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2)); // Perform the larger operation, then round down. 
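+    // Re-emit with the node's own opcode so a promoted CTSELECT remains a
+    // CTSELECT (and keeps its flags) instead of decaying to a plain SELECT.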
- Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3); + Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3); + Tmp1->setFlags(Node->getFlags()); if (TruncOp != ISD::FP_ROUND) Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1); else diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 437d0f4654096..61251e58046d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -159,6 +159,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::ATOMIC_LOAD: R = SoftenFloatRes_ATOMIC_LOAD(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break; + case ISD::CTSELECT: R = SoftenFloatRes_CTSELECT(N); break; case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break; case ISD::FREEZE: R = SoftenFloatRes_FREEZE(N); break; case ISD::STRICT_SINT_TO_FP: @@ -1041,6 +1042,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) { LHS.getValueType(), N->getOperand(0), LHS, RHS); } +SDValue DAGTypeLegalizer::SoftenFloatRes_CTSELECT(SDNode *N) { + SDValue LHS = GetSoftenedFloat(N->getOperand(1)); + SDValue RHS = GetSoftenedFloat(N->getOperand(2)); + return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, + RHS); +} + SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) { SDValue LHS = GetSoftenedFloat(N->getOperand(2)); SDValue RHS = GetSoftenedFloat(N->getOperand(3)); @@ -1541,6 +1549,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; @@ -2897,6 +2906,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { R = PromoteFloatRes_ATOMIC_LOAD(N); break; case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break; + case ISD::CTSELECT: + R = PromoteFloatRes_SELECT(N); + break; case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break; case ISD::SINT_TO_FP: @@ -3199,7 +3211,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) { SDValue TrueVal = GetPromotedFloat(N->getOperand(1)); SDValue FalseVal = GetPromotedFloat(N->getOperand(2)); - return DAG.getNode(ISD::SELECT, SDLoc(N), TrueVal->getValueType(0), + return DAG.getNode(N->getOpcode(), SDLoc(N), TrueVal->getValueType(0), N->getOperand(0), TrueVal, FalseVal); } @@ -3383,6 +3395,9 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { R = SoftPromoteHalfRes_ATOMIC_LOAD(N); break; case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break; + case ISD::CTSELECT: + R = SoftPromoteHalfRes_SELECT(N); + break; case ISD::SELECT_CC: R = SoftPromoteHalfRes_SELECT_CC(N); break; case ISD::STRICT_SINT_TO_FP: case ISD::STRICT_UINT_TO_FP: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 88a4a8b16373b..124f61df9679b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -95,6 +95,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VECTOR_COMPRESS(N); break; case ISD::SELECT: + case ISD::CTSELECT: case 
ISD::VSELECT: case ISD::VP_SELECT: case ISD::VP_MERGE: @@ -2000,6 +2001,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { break; case ISD::VSELECT: case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break; + case ISD::CTSELECT: + Res = PromoteIntOp_CTSELECT(N, OpNo); + break; case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break; case ISD::VP_SETCC: case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break; @@ -2377,6 +2381,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) { N->getOperand(2)), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo) { + assert(OpNo == 0 && "Only know how to promote the condition!"); + SDValue Cond = N->getOperand(0); + EVT OpTy = N->getOperand(1).getValueType(); + + // Promote all the way up to the canonical SetCC type. + EVT OpVT = N->getOpcode() == ISD::CTSELECT ? OpTy.getScalarType() : OpTy; + Cond = PromoteTargetBoolean(Cond, OpVT); + + return SDValue( + DAG.UpdateNodeOperands(N, Cond, N->getOperand(1), N->getOperand(2)), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) { assert(OpNo == 0 && "Don't know how to promote this operand!"); @@ -2978,6 +2995,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ARITH_FENCE: SplitRes_ARITH_FENCE(N, Lo, Hi); break; case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: + SplitRes_Select(N, Lo, Hi); + break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 603dc34ce72a7..f76520ad07508 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -401,6 +401,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N); SDValue PromoteIntOp_ScalarOp(SDNode *N); SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_Shift(SDNode *N); @@ -633,6 +634,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_LOAD(SDNode *N); SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N); SDValue SoftenFloatRes_SELECT(SDNode *N); + SDValue SoftenFloatRes_CTSELECT(SDNode *N); SDValue SoftenFloatRes_SELECT_CC(SDNode *N); SDValue SoftenFloatRes_UNDEF(SDNode *N); SDValue SoftenFloatRes_VAARG(SDNode *N); @@ -893,6 +895,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); + SDValue ScalarizeVecRes_CTSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT_CC(SDNode *N); SDValue ScalarizeVecRes_SETCC(SDNode *N); SDValue ScalarizeVecRes_UNDEF(SDNode *N); @@ -1221,7 +1224,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue &Lo, SDValue &Hi); void SplitVecRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitRes_Select (SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi); void 
SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 88c1af20a321e..098368ef2f6b3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -570,6 +570,20 @@ void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) { Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH, EVLHi); } +void DAGTypeLegalizer::SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { + SDValue LL, LH, RL, RH, CL, CH; + SDLoc dl(N); + GetSplitOp(N->getOperand(1), LL, LH); + GetSplitOp(N->getOperand(2), RL, RH); + + SDValue Cond = N->getOperand(0); + CL = CH = Cond; + assert(!Cond.getValueType().isVector() && "Unsupported vector type"); + + Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL); + Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH); +} + void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue LL, LH, RL, RH; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 3b5f83f7c089a..4ecc12c1f0e31 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -74,6 +74,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break; case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break; case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break; + case ISD::CTSELECT: + R = ScalarizeVecRes_CTSELECT(N); + break; case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break; case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; case ISD::POISON: @@ -655,6 +658,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) { GetScalarizedVector(N->getOperand(2))); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_CTSELECT(SDNode *N) { + SDValue LHS = GetScalarizedVector(N->getOperand(1)); + return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS, + GetScalarizedVector(N->getOperand(2))); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(2)); return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(), @@ -1189,6 +1198,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SELECT: case ISD::VP_MERGE: case ISD::VP_SELECT: SplitRes_Select(N, Lo, Hi); break; + case ISD::CTSELECT: + SplitRes_CTSELECT(N, Lo, Hi); + break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; @@ -4854,6 +4866,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break; case ISD::VSELECT: case ISD::SELECT: + case ISD::CTSELECT: case ISD::VP_SELECT: case ISD::VP_MERGE: Res = WidenVecRes_Select(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 90edaf3ef5471..8e14aa1f869a3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8249,6 +8249,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, 
return V; break; } + case ISD::SELECT: case ISD::VSELECT: if (SDValue V = simplifySelect(N1, N2, N3)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index cb0038c54f8c7..90d9ac76b6e57 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6489,6 +6489,172 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I, setValue(&I, Result); } +/// Fallback implementation is an alternative approach for managing +/// architectures that don't have native support for Constant-Time Select. This +/// function uses DAG Chaining +SDValue SelectionDAGBuilder::createProtectedCtSelectFallbackChain( + SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F, + EVT VT) { + + SDValue WorkingT = T; + SDValue WorkingF = F; + EVT WorkingVT = VT; + + SDValue Chain = DAG.getEntryNode(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + + if (VT.isVector() && !Cond.getValueType().isVector()) { + ElementCount NumElems = VT.getVectorElementCount(); + EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems); + + if (VT.isScalableVector()) { + Cond = DAG.getSplatVector(CondVT, DL, Cond); + } else { + Cond = DAG.getSplatBuildVector(CondVT, DL, Cond); + } + } + + if (VT.isFloatingPoint()) { + if (VT.isVector()) { + // float vector -> int vector + EVT ElemVT = VT.getVectorElementType(); + unsigned int ElemBitWidth = ElemVT.getScalarSizeInBits(); + EVT IntElemVT = EVT::getIntegerVT(*DAG.getContext(), ElemBitWidth); + + WorkingVT = EVT::getVectorVT(*DAG.getContext(), IntElemVT, + VT.getVectorElementCount()); + } else { + WorkingVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + } + + WorkingT = DAG.getBitcast(WorkingVT, T); + WorkingF = DAG.getBitcast(WorkingVT, F); + } + + SDValue Mask = DAG.getSExtOrTrunc(Cond, DL, WorkingVT); + + SDValue AllOnes; + if (WorkingVT.isScalableVector()) { + unsigned BitWidth = WorkingVT.getScalarSizeInBits(); + APInt AllOnesVal = APInt::getAllOnes(BitWidth); + SDValue ScalarAllOnes = + DAG.getConstant(AllOnesVal, DL, WorkingVT.getScalarType()); + AllOnes = DAG.getSplatVector(WorkingVT, DL, ScalarAllOnes); + } else { + AllOnes = DAG.getAllOnesConstant(DL, WorkingVT); + } + + SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes); + + // (or (and WorkingT, Mask), (and F, ~Mask)) + SDValue TM = DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT); + + bool CanUseChaining = false; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (CanUseChaining) { + // Apply chaining through registers for additional protection + + const TargetRegisterClass *RC = TLI.getRegClassFor(WorkingVT.getSimpleVT()); + Register TMReg = MRI.createVirtualRegister(RC); + Chain = DAG.getCopyToReg(Chain, DL, TMReg, TM); + TM = DAG.getCopyFromReg(Chain, DL, TMReg, WorkingVT); + } + + SDValue FM = DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF); + + if (!WorkingVT.isScalableVector()) { + // For fixed-size vectors and scalars, we can safely use register classes + CanUseChaining = TLI.isTypeLegal(WorkingVT.getSimpleVT()); + } else { + // For scalable vectors, check if the target has register class support + // This is target-specific - RISC-V might not support this directly + CanUseChaining = false; // Conservative: disable for scalable vectors + } + + + SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM); + + // Convert back if needed + if (WorkingVT != VT) { + 
Result = DAG.getBitcast(VT, Result); + } + + return Result; +} + +/// Fallback implementation is an alternative approach for managing +/// architectures that don't have native support for Constant-Time Select. This +/// function uses the NoMerge flag +SDValue SelectionDAGBuilder::createProtectedCtSelectFallbackNoMerge( + SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F, + EVT VT) { + SDNodeFlags ProtectedFlag; + ProtectedFlag.setNoMerge(true); + + SDValue WorkingT = T; + SDValue WorkingF = F; + EVT WorkingVT = VT; + + if (VT.isVector() && !Cond.getValueType().isVector()) { + ElementCount NumElems = VT.getVectorElementCount(); + EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems); + + if (VT.isScalableVector()) { + Cond = DAG.getSplatVector(CondVT, DL, Cond); + } else { + Cond = DAG.getSplatBuildVector(CondVT, DL, Cond); + } + } + + if (VT.isFloatingPoint()) { + if (VT.isVector()) { + // float vector -> int vector + EVT ElemVT = VT.getVectorElementType(); + unsigned int ElemBitWidth = ElemVT.getScalarSizeInBits(); + EVT IntElemVT = EVT::getIntegerVT(*DAG.getContext(), ElemBitWidth); + + WorkingVT = EVT::getVectorVT(*DAG.getContext(), IntElemVT, + VT.getVectorElementCount()); + } else { + WorkingVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + } + + WorkingT = DAG.getBitcast(WorkingVT, T); + WorkingF = DAG.getBitcast(WorkingVT, F); + } + + SDValue Mask = DAG.getSExtOrTrunc(Cond, DL, WorkingVT); + + SDValue AllOnes; + if (WorkingVT.isScalableVector()) { + unsigned BitWidth = WorkingVT.getScalarSizeInBits(); + APInt AllOnesVal = APInt::getAllOnes(BitWidth); + SDValue ScalarAllOnes = + DAG.getConstant(AllOnesVal, DL, WorkingVT.getScalarType()); + AllOnes = DAG.getSplatVector(WorkingVT, DL, ScalarAllOnes); + } else { + AllOnes = DAG.getAllOnesConstant(DL, WorkingVT); + } + + SDValue Invert = + DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes, ProtectedFlag); + + // (or (and WorkingT, Mask), (and F, ~Mask)) + SDValue TM = + DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT, ProtectedFlag); + SDValue FM = + DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF, ProtectedFlag); + SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM, ProtectedFlag); + + // Convert back if needed + if (WorkingVT != VT) { + Result = DAG.getBitcast(VT, Result); + } + + return Result; +} + /// Lower the call to the specified intrinsic function. void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { @@ -6667,6 +6833,53 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, updateDAGForMaybeTailCall(MC); return; } + case Intrinsic::ct_select: { + SDLoc DL = getCurSDLoc(); + + SDValue Cond = getValue(I.getArgOperand(0)); // i1 + SDValue A = getValue(I.getArgOperand(1)); // T + SDValue B = getValue(I.getArgOperand(2)); // T + + assert((A.getValueType() == B.getValueType()) && + "Operands are of different types"); + + EVT VT = A.getValueType(); + EVT CondVT = Cond.getValueType(); + + // For now we'll only support scalar predicates + // assert if Cond type is Vector + // TODO: Maybe look into supporting vector predicates? 
+ if (CondVT.isVector()) { + report_fatal_error( + "llvm.ct.select: predicates with vector types not supported yet"); + } + + // Set function attribute to indicate ct.select usage + Function &F = DAG.getMachineFunction().getFunction(); + F.addFnAttr("ct-select"); + + // Handle scalar types + if (TLI.isSelectSupported( + TargetLoweringBase::SelectSupportKind::CtSelect) && + !CondVT.isVector()) { + SDValue Result = DAG.getNode(ISD::CTSELECT, DL, VT, Cond, A, B); + setValue(&I, Result); + return; + } + + // We don't support non-integral pointers + Type *CurrType = VT.getTypeForEVT(*Context); + if (CurrType->isPointerTy()) { + unsigned AS = CurrType->getPointerAddressSpace(); + if (DAG.getDataLayout().isNonIntegralAddressSpace(AS)) { + report_fatal_error( + "llvm.ct.select: non-integral pointers are not supported"); + } + } + + setValue(&I, createProtectedCtSelectFallbackChain(DAG, DL, Cond, A, B, VT)); + return; + } case Intrinsic::call_preallocated_setup: { const CallBase *PreallocatedCall = FindPreallocatedCall(&I); SDValue SrcValue = DAG.getSrcValue(PreallocatedCall); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index c7577fa335feb..6068818a32656 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -214,6 +214,12 @@ class SelectionDAGBuilder { peelDominantCaseCluster(const SwitchInst &SI, SwitchCG::CaseClusterVector &Clusters, BranchProbability &PeeledCaseProb); + SDValue createProtectedCtSelectFallbackChain(SelectionDAG &DAG, + const SDLoc &DL, SDValue Cond, + SDValue T, SDValue F, EVT VT); + SDValue createProtectedCtSelectFallbackNoMerge(SelectionDAG &DAG, + const SDLoc &DL, SDValue Cond, + SDValue T, SDValue F, EVT VT); private: const TargetMachine &TM; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 39cbfad6d0be1..274a1cd4f7594 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -332,6 +332,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FPOWI: return "fpowi"; case ISD::STRICT_FPOWI: return "strict_fpowi"; case ISD::SETCC: return "setcc"; + case ISD::CTSELECT: return "ctselect"; case ISD::SETCCCARRY: return "setcccarry"; case ISD::STRICT_FSETCC: return "strict_fsetcc"; case ISD::STRICT_FSETCCS: return "strict_fsetccs"; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 662d84b7a60a8..89e949d96146e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -511,12 +511,35 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::i8, Promote); + setOperationAction(ISD::CTSELECT, MVT::i16, Promote); + setOperationAction(ISD::CTSELECT, MVT::i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::i64, Custom); if (Subtarget->hasFPARMv8()) { setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT, MVT::bf16, Custom); } + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::CTSELECT, MVT::f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::bf16, Custom); + } else { + setOperationAction(ISD::CTSELECT, 
MVT::f16, Promote); + setOperationAction(ISD::CTSELECT, MVT::bf16, Promote); + } setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::f64, Custom); + for (MVT VT : MVT::vector_valuetypes()) { + MVT elemType = VT.getVectorElementType(); + if (elemType == MVT::i8 || elemType == MVT::i16) { + setOperationAction(ISD::CTSELECT, VT, Promote); + } else if ((elemType == MVT::f16 || elemType == MVT::bf16) && !Subtarget->hasFullFP16()) { + setOperationAction(ISD::CTSELECT, VT, Promote); + } else { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + } setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); @@ -3328,6 +3351,18 @@ void AArch64TargetLowering::fixupPtrauthDiscriminator( IntDiscOp.setImm(IntDisc); } +MachineBasicBlock *AArch64TargetLowering::EmitCTSELECT(MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder Builder = BuildMI(*MBB, MI, DL, TII->get(Opcode)); + for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { + Builder.add(MI.getOperand(Idx)); + } + Builder->setFlag(MachineInstr::NoMerge); + MBB->remove_instr(&MI); + return MBB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -7590,6 +7625,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::CTSELECT: + return LowerCTSELECT(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::BR_JT: @@ -12146,6 +12183,22 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, return Res; } +SDValue AArch64TargetLowering::LowerCTSELECT(SDValue Op, + SelectionDAG &DAG) const { + SDValue CCVal = Op->getOperand(0); + SDValue TVal = Op->getOperand(1); + SDValue FVal = Op->getOperand(2); + SDLoc DL(Op); + + EVT VT = Op.getValueType(); + + SDValue Zero = DAG.getConstant(0, DL, CCVal.getValueType()); + SDValue CC; + SDValue Cmp = getAArch64Cmp(CCVal, Zero, ISD::SETNE, CC, DAG, DL); + + return DAG.getNode(AArch64ISD::CTSELECT, DL, VT, TVal, FVal, CC, Cmp); +} + SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. 
No additional tweaking diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 9495c9ffc47aa..415360ea57adf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -23,6 +23,11 @@ namespace llvm { +namespace AArch64ISD { +// Forward declare the enum from the generated file +enum GenNodeType : unsigned; +} // namespace AArch64ISD + class AArch64TargetMachine; namespace AArch64 { @@ -202,6 +207,8 @@ class AArch64TargetLowering : public TargetLowering { MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const; + MachineBasicBlock *EmitCTSELECT(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) const; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -685,6 +692,7 @@ class AArch64TargetLowering : public TargetLowering { iterator_range Users, SDNodeFlags Flags, const SDLoc &dl, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; @@ -920,6 +928,10 @@ class AArch64TargetLowering : public TargetLowering { bool hasMultipleConditionRegisters(EVT VT) const override { return VT.isScalableVector(); } + + bool isSelectSupported(SelectSupportKind Kind) const override { + return true; + } }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 12c600f0f2661..7b3fbc64ada36 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2113,16 +2113,46 @@ bool AArch64InstrInfo::removeCmpToZeroOrOne( return true; } -bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && - MI.getOpcode() != AArch64::CATCHRET) - return false; +static inline void expandCtSelect(MachineBasicBlock &MBB, MachineInstr &MI, DebugLoc &DL, const MCInstrDesc &MCID) { + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, MCID); + for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { + Builder.add(MI.getOperand(Idx)); + } + Builder->setFlag(MachineInstr::NoMerge); + MBB.remove_instr(&MI); +} +bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); auto &Subtarget = MBB.getParent()->getSubtarget(); auto TRI = Subtarget.getRegisterInfo(); DebugLoc DL = MI.getDebugLoc(); + switch (MI.getOpcode()) { + case AArch64::I32CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::CSELWr)); + return true; + case AArch64::I64CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::CSELXr)); + return true; + case AArch64::BF16CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELHrrr)); + return true; + case AArch64::F16CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELHrrr)); + return true; + case AArch64::F32CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELSrrr)); + return true; + case AArch64::F64CTSELECT: + expandCtSelect(MBB, MI, DL, get(AArch64::FCSELDrrr)); + return true; + } + + if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && + MI.getOpcode() != AArch64::CATCHRET) + return false; + if (MI.getOpcode() == AArch64::CATCHRET) { // Skip to the first instruction before the epilog. 
const TargetInstrInfo *TII = diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f788c7510f80c..64de1674b494d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -464,6 +464,11 @@ def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; +def SDT_AArch64CtSelect : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<4, i32>]>; def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, @@ -831,6 +836,7 @@ def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz, def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz, [SDNPHasChain]>; +def AArch64ctselect : SDNode<"AArch64ISD::CTSELECT", SDT_AArch64CtSelect>; def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>; // Conditional select invert. @@ -5683,6 +5689,45 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd), let hasNoSchedulingInfo = 1; } +//===----------------------------------------------------------------------===// +// Constant-time conditional selection instructions +//===----------------------------------------------------------------------===// + +let hasSideEffects = 1, isPseudo = 1, hasNoSchedulingInfo = 1, Uses = [NZCV] in { + def I32CTSELECT : Pseudo<(outs GPR32:$dst), + (ins GPR32:$tval, GPR32:$fval, i32imm:$cc), + [(set (i32 GPR32:$dst), + (AArch64ctselect GPR32:$tval, GPR32:$fval, + (i32 imm:$cc), NZCV))]>; + def I64CTSELECT : Pseudo<(outs GPR64:$dst), + (ins GPR64:$tval, GPR64:$fval, i32imm:$cc), + [(set (i64 GPR64:$dst), + (AArch64ctselect GPR64:$tval, GPR64:$fval, + (i32 imm:$cc), NZCV))]>; + let Predicates = [HasFullFP16] in { + def F16CTSELECT : Pseudo<(outs FPR16:$dst), + (ins FPR16:$tval, FPR16:$fval, i32imm:$cc), + [(set (f16 FPR16:$dst), + (AArch64ctselect (f16 FPR16:$tval), (f16 FPR16:$fval), + (i32 imm:$cc), NZCV))]>; + def BF16CTSELECT : Pseudo<(outs FPR16:$dst), + (ins FPR16:$tval, FPR16:$fval, i32imm:$cc), + [(set (bf16 FPR16:$dst), + (AArch64ctselect (bf16 FPR16:$tval), (bf16 FPR16:$fval), + (i32 imm:$cc), NZCV))]>; + } + def F32CTSELECT : Pseudo<(outs FPR32:$dst), + (ins FPR32:$tval, FPR32:$fval, i32imm:$cc), + [(set (f32 FPR32:$dst), + (AArch64ctselect FPR32:$tval, FPR32:$fval, + (i32 imm:$cc), NZCV))]>; + def F64CTSELECT : Pseudo<(outs FPR64:$dst), + (ins FPR64:$tval, FPR64:$fval, i32imm:$cc), + [(set (f64 FPR64:$dst), + (AArch64ctselect FPR64:$tval, FPR64:$fval, + (i32 imm:$cc), NZCV))]>; +} + //===----------------------------------------------------------------------===// // Instructions used for emitting unwind opcodes on ARM64 Windows. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 39946633603f6..e2ec9118eb5ee 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -393,5 +393,23 @@ void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(AArch64::RET); OutMI.addOperand(MCOperand::createReg(AArch64::LR)); break; + case AArch64::I32CTSELECT: + OutMI.setOpcode(AArch64::CSELWr); + break; + case AArch64::I64CTSELECT: + OutMI.setOpcode(AArch64::CSELXr); + break; + case AArch64::BF16CTSELECT: + OutMI.setOpcode(AArch64::FCSELHrrr); + break; + case AArch64::F16CTSELECT: + OutMI.setOpcode(AArch64::FCSELHrrr); + break; + case AArch64::F32CTSELECT: + OutMI.setOpcode(AArch64::FCSELSrrr); + break; + case AArch64::F64CTSELECT: + OutMI.setOpcode(AArch64::FCSELDrrr); + break; } } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 22769dbf38719..fa10c00526cf7 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1526,18 +1526,340 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const { BB->erase(MI); } +// Expands the ctselect pseudo for vector operands, post-RA. +bool ARMBaseInstrInfo::expandCtSelectVector(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + Register DestReg = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); + + // These operations will differ by operand register size. + unsigned AndOp = ARM::VANDd; + unsigned BicOp = ARM::VBICd; + unsigned OrrOp = ARM::VORRd; + unsigned BroadcastOp = ARM::VDUP32d; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(DestReg); + + if (ARM::QPRRegClass.hasSubClassEq(RC)) { + AndOp = ARM::VANDq; + BicOp = ARM::VBICq; + OrrOp = ARM::VORRq; + BroadcastOp = ARM::VDUP32q; + } + + unsigned RsbOp = Subtarget.isThumb2() ? ARM::t2RSBri : ARM::RSBri; + + // Any vector pseudo has: ((outs $dst, $tmp_mask, $bcast_mask), (ins $src1, $src2, $cond)) + Register VectorMaskReg = MI.getOperand(2).getReg(); + Register Src1Reg = MI.getOperand(3).getReg(); + Register Src2Reg = MI.getOperand(4).getReg(); + Register CondReg = MI.getOperand(5).getReg(); + + // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask) + + // 1. mask = 0 - cond + // When cond = 0: mask = 0x00000000. + // When cond = 1: mask = 0xFFFFFFFF. + + MachineInstr *FirstNewMI = + BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg) + .addReg(CondReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 2. A = src1 & mask + // For vectors, broadcast the scalar mask so it matches operand size. + BuildMI(*MBB, MI, DL, get(BroadcastOp), VectorMaskReg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(AndOp), DestReg) + .addReg(Src1Reg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 3. B = src2 & ~mask + BuildMI(*MBB, MI, DL, get(BicOp), VectorMaskReg) + .addReg(Src2Reg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 4. 
result = A | B + auto LastNewMI = BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) + .addReg(DestReg) + .addReg(VectorMaskReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + auto BundleStart = FirstNewMI->getIterator(); + auto BundleEnd = LastNewMI->getIterator(); + + // Add instruction bundling + finalizeBundle(*MBB, BundleStart, std::next(BundleEnd)); + + MI.eraseFromParent(); + return true; +} + +// Expands the ctselect pseudo for thumb1, post-RA. +bool ARMBaseInstrInfo::expandCtSelectThumb(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + // pseudos in thumb1 mode have: (outs $dst, $tmp_mask), (ins $src1, $src2, $cond)) + // register class here is always tGPR. + Register DestReg = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); + Register Src1Reg = MI.getOperand(2).getReg(); + Register Src2Reg = MI.getOperand(3).getReg(); + Register CondReg = MI.getOperand(4).getReg(); + + // Access register info + MachineFunction *MF = MBB->getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned RegSize = TRI->getRegSizeInBits(MaskReg, MRI); + unsigned ShiftAmount = RegSize - 1; + + // Option 1: Shift-based mask (preferred - no flag modification) + MachineInstr *FirstNewMI = + BuildMI(*MBB, MI, DL, get(ARM::tMOVr), MaskReg) + .addReg(CondReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Instead of using RSB, we can use LSL and ASR to get the mask. This is to avoid the flag modification caused by RSB. + BuildMI(*MBB, MI, DL, get(ARM::tLSLri), MaskReg) + .addReg(MaskReg) + .addImm(ShiftAmount) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(ARM::tASRri), MaskReg) + .addReg(MaskReg) + .addImm(ShiftAmount) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 2. xor_diff = src1 ^ src2 + BuildMI(*MBB, MI, DL, get(ARM::tMOVr), DestReg) + .addReg(Src1Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg) + .addReg(DestReg) + .addReg(Src2Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 3. masked_xor = xor_diff & mask + BuildMI(*MBB, MI, DL, get(ARM::tAND), DestReg) + .addReg(DestReg) + .addReg(MaskReg, RegState::Kill) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 4. result = src2 ^ masked_xor + auto LastMI = BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg) + .addReg(DestReg) + .addReg(Src2Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Add instruction bundling + auto BundleStart = FirstNewMI->getIterator(); + finalizeBundle(*MBB, BundleStart, std::next(LastMI->getIterator())); + + MI.eraseFromParent(); + return true; +} + +// Expands the ctselect pseudo, post-RA. +bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + Register DestReg = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); + Register DestRegSavedRef = DestReg; + Register Src1Reg, Src2Reg, CondReg; + + // These operations will differ by operand register size. 
+ unsigned RsbOp = ARM::RSBri; + unsigned AndOp = ARM::ANDrr; + unsigned BicOp = ARM::BICrr; + unsigned OrrOp = ARM::ORRrr; + + if (Subtarget.isThumb2()) { + RsbOp = ARM::t2RSBri; + AndOp = ARM::t2ANDrr; + BicOp = ARM::t2BICrr; + OrrOp = ARM::t2ORRrr; + } + + unsigned Opcode = MI.getOpcode(); + bool IsFloat = Opcode == ARM::CTSELECTf32 || Opcode == ARM::CTSELECTf16 || Opcode == ARM::CTSELECTbf16; + MachineInstr *FirstNewMI = nullptr; + if (IsFloat) { + // Each float pseudo has: (outs $dst, $tmp_mask, $scratch1, $scratch2), (ins $src1, $src2, $cond)) + // We use two scratch registers in tablegen for bitwise ops on float types,. + Register GPRScratch1 = MI.getOperand(2).getReg(); + Register GPRScratch2 = MI.getOperand(3).getReg(); + + // choice a from __builtin_ct_select(cond, a, b) + Src1Reg = MI.getOperand(4).getReg(); + // choice b from __builtin_ct_select(cond, a, b) + Src2Reg = MI.getOperand(5).getReg(); + // cond from __builtin_ct_select(cond, a, b) + CondReg = MI.getOperand(6).getReg(); + + // Move fp src1 to GPR scratch1 so we can do our bitwise ops + FirstNewMI = BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch1) + .addReg(Src1Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Move src2 to scratch2 + BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch2) + .addReg(Src2Reg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + Src1Reg = GPRScratch1; + Src2Reg = GPRScratch2; + // Reuse GPRScratch1 for dest after we are done working with src1. + DestReg = GPRScratch1; + } else { + // Any non-float, non-vector pseudo has: (outs $dst, $tmp_mask), (ins $src1, $src2, $cond)) + Src1Reg = MI.getOperand(2).getReg(); + Src2Reg = MI.getOperand(3).getReg(); + CondReg = MI.getOperand(4).getReg(); + } + + // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask) + + // 1. mask = 0 - cond + // When cond = 0: mask = 0x00000000. + // When cond = 1: mask = 0xFFFFFFFF. + auto TmpNewMI = BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg) + .addReg(CondReg) + .addImm(0) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // We use the first instruction in the bundle as the first instruction. + if (!FirstNewMI) + FirstNewMI = TmpNewMI; + + // 2. A = src1 & mask + BuildMI(*MBB, MI, DL, get(AndOp), DestReg) + .addReg(Src1Reg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 3. B = src2 & ~mask + BuildMI(*MBB, MI, DL, get(BicOp), MaskReg) + .addReg(Src2Reg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // 4. result = A | B + auto LastNewMI = BuildMI(*MBB, MI, DL, get(OrrOp), DestReg) + .addReg(DestReg) + .addReg(MaskReg) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + if (IsFloat) { + // Return our result from GPR to the correct register type. 
+ LastNewMI =BuildMI(*MBB, MI, DL, get(ARM::VMOVSR), DestRegSavedRef) + .addReg(DestReg) + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + } + + auto BundleStart = FirstNewMI->getIterator(); + auto BundleEnd = LastNewMI->getIterator(); + + // Add instruction bundling + finalizeBundle(*MBB, BundleStart, std::next(BundleEnd)); + + MI.eraseFromParent(); + return true; +} + bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { - if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) { + auto opcode = MI.getOpcode(); + + if (opcode == TargetOpcode::LOAD_STACK_GUARD) { expandLoadStackGuard(MI); MI.getParent()->erase(MI); return true; } - if (MI.getOpcode() == ARM::MEMCPY) { + if (opcode == ARM::MEMCPY) { expandMEMCPY(MI); return true; } + if (opcode == ARM::CTSELECTf64) { + if (Subtarget.isThumb1Only()) { + LLVM_DEBUG(dbgs() << "Opcode (thumb1 subtarget) " << opcode << "replaced by: " << MI); + return expandCtSelectThumb(MI); + } else { + LLVM_DEBUG(dbgs() << "Opcode (vector) " << opcode << "replaced by: " << MI); + return expandCtSelectVector(MI); + } + } + + if (opcode == ARM::CTSELECTv8i8 || + opcode == ARM::CTSELECTv4i16 || + opcode == ARM::CTSELECTv2i32 || + opcode == ARM::CTSELECTv1i64 || + opcode == ARM::CTSELECTv2f32 || + opcode == ARM::CTSELECTv4f16 || + opcode == ARM::CTSELECTv4bf16 || + opcode == ARM::CTSELECTv16i8 || + opcode == ARM::CTSELECTv8i16 || + opcode == ARM::CTSELECTv4i32 || + opcode == ARM::CTSELECTv2i64 || + opcode == ARM::CTSELECTv4f32 || + opcode == ARM::CTSELECTv2f64 || + opcode == ARM::CTSELECTv8f16 || + opcode == ARM::CTSELECTv8bf16) { + LLVM_DEBUG(dbgs() << "Opcode (vector) " << opcode << "replaced by: " << MI); + return expandCtSelectVector(MI); + } + + if (opcode == ARM::CTSELECTint || + opcode == ARM::CTSELECTf16 || + opcode == ARM::CTSELECTbf16 || + opcode == ARM::CTSELECTf32) { + if (Subtarget.isThumb1Only()) { + LLVM_DEBUG(dbgs() << "Opcode (thumb1 subtarget) " << opcode << "replaced by: " << MI); + return expandCtSelectThumb(MI); + } else { + LLVM_DEBUG(dbgs() << "Opcode " << opcode << "replaced by: " << MI); + return expandCtSelect(MI); + } + } + // This hook gets to expand COPY instructions before they become // copyPhysReg() calls. Look for VMOVS instructions that can legally be // widened to VMOVD. We prefer the VMOVD when possible because it may be diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 2869e7f708046..f0e090f09f5dc 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -221,6 +221,12 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; + bool expandCtSelectVector(MachineInstr &MI) const; + + bool expandCtSelectThumb(MachineInstr &MI) const; + + bool expandCtSelect(MachineInstr &MI) const; + bool expandPostRAPseudo(MachineInstr &MI) const override; bool shouldSink(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 847b7af5a9b11..62f5b21a738dd 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -4200,6 +4200,92 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. 
break; } + case ARMISD::CTSELECT: { + EVT VT = N->getValueType(0); + unsigned PseudoOpcode; + bool IsFloat = false; + bool IsVector = false; + + if (VT == MVT::f16) { + PseudoOpcode = ARM::CTSELECTf16; + IsFloat = true; + } else if (VT == MVT::bf16) { + PseudoOpcode = ARM::CTSELECTbf16; + IsFloat = true; + } else if (VT == MVT::f32) { + PseudoOpcode = ARM::CTSELECTf32; + IsFloat = true; + } else if (VT == MVT::f64) { + PseudoOpcode = ARM::CTSELECTf64; + IsVector = true; + } else if (VT == MVT::v8i8) { + PseudoOpcode = ARM::CTSELECTv8i8; + IsVector = true; + } else if (VT == MVT::v4i16) { + PseudoOpcode = ARM::CTSELECTv4i16; + IsVector = true; + } else if (VT == MVT::v2i32) { + PseudoOpcode = ARM::CTSELECTv2i32; + IsVector = true; + } else if (VT == MVT::v1i64) { + PseudoOpcode = ARM::CTSELECTv1i64; + IsVector = true; + } else if (VT == MVT::v2f32) { + PseudoOpcode = ARM::CTSELECTv2f32; + IsVector = true; + } else if (VT == MVT::v4f16) { + PseudoOpcode = ARM::CTSELECTv4f16; + IsVector = true; + } else if (VT == MVT::v4bf16) { + PseudoOpcode = ARM::CTSELECTv4bf16; + IsVector = true; + } else if (VT == MVT::v16i8) { + PseudoOpcode = ARM::CTSELECTv16i8; + IsVector = true; + } else if (VT == MVT::v8i16) { + PseudoOpcode = ARM::CTSELECTv8i16; + IsVector = true; + } else if (VT == MVT::v4i32) { + PseudoOpcode = ARM::CTSELECTv4i32; + IsVector = true; + } else if (VT == MVT::v2i64) { + PseudoOpcode = ARM::CTSELECTv2i64; + IsVector = true; + } else if (VT == MVT::v4f32) { + PseudoOpcode = ARM::CTSELECTv4f32; + IsVector = true; + } else if (VT == MVT::v2f64) { + PseudoOpcode = ARM::CTSELECTv2f64; + IsVector = true; + } else if (VT == MVT::v8f16) { + PseudoOpcode = ARM::CTSELECTv8f16; + IsVector = true; + } else if (VT == MVT::v8bf16) { + PseudoOpcode = ARM::CTSELECTv8bf16; + IsVector = true; + } else { + // i1, i8, i16, i32, i64 + PseudoOpcode = ARM::CTSELECTint; + } + + SmallVector VTs; + VTs.push_back(VT); // $dst + VTs.push_back(MVT::i32); // $tmp_mask (always GPR) + + if (IsVector) { + VTs.push_back(VT); // $bcast_mask (same type as dst for vectors) + } else if (IsFloat) { + VTs.push_back(MVT::i32); // $scratch1 (GPR) + VTs.push_back(MVT::i32); // $scratch2 (GPR) + } + + // src1, src2, cond + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; + + SDNode *ResNode = CurDAG->getMachineNode(PseudoOpcode, SDLoc(N), VTs, Ops); + ReplaceNode(N, ResNode); + return; + } case ARMISD::VZIP: { EVT VT = N->getValueType(0); // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. 
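The generic fallbacks in SelectionDAGBuilder above and the ARM post-RA expansions (expandCtSelect / expandCtSelectThumb) all reduce ctselect to the same branchless bit-mask idiom. A minimal stand-alone sketch of the two variants follows; the function names are illustrative and not part of the patch, and 32-bit operands with arithmetic right shift (as on ARM) are assumed:

```cpp
#include <cassert>
#include <cstdint>

// (a & mask) | (b & ~mask) with mask = 0 - cond, mirroring the RSB/AND/BIC/ORR
// sequence emitted by expandCtSelect. No data-dependent branch is involved.
static uint32_t ct_select_mask(uint32_t cond, uint32_t a, uint32_t b) {
  uint32_t mask = 0u - (cond & 1u); // 0x00000000 or 0xFFFFFFFF
  return (a & mask) | (b & ~mask);
}

// b ^ ((a ^ b) & mask), with the mask built from a shift pair as in the Thumb1
// expansion, which uses LSL/ASR instead of RSB to avoid clobbering the flags.
static uint32_t ct_select_xor(uint32_t cond, uint32_t a, uint32_t b) {
  uint32_t mask = static_cast<uint32_t>(static_cast<int32_t>(cond << 31) >> 31);
  return b ^ ((a ^ b) & mask);
}

int main() {
  for (uint32_t c = 0; c <= 1; ++c) {
    assert(ct_select_mask(c, 7u, 9u) == (c ? 7u : 9u));
    assert(ct_select_xor(c, 7u, 9u) == (c ? 7u : 9u));
  }
}
```

Either form yields the first operand when the condition is non-zero and the second otherwise, matching the llvm.ct.select operand order used throughout the patch.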
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 67ea2dd3df792..c5729aa990bf6 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -203,6 +203,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) { setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT.isInteger()) { setOperationAction(ISD::SHL, VT, Custom); @@ -304,6 +305,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Custom); // Vector reductions setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); @@ -355,6 +357,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Custom); // Pre and Post inc are supported on loads and stores for (unsigned im = (unsigned)ISD::PRE_INC; @@ -408,6 +411,28 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::CTSELECT, MVT::v4f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom); + } + + if (Subtarget->hasBF16()) { + setOperationAction(ISD::CTSELECT, MVT::v4bf16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8bf16, Custom); + } + + // small exotic vectors get scalarised for ctselect + setOperationAction(ISD::CTSELECT, MVT::v1i8, Expand); + setOperationAction(ISD::CTSELECT, MVT::v1i16, Expand); + setOperationAction(ISD::CTSELECT, MVT::v1i32, Expand); + setOperationAction(ISD::CTSELECT, MVT::v1f32, Expand); + setOperationAction(ISD::CTSELECT, MVT::v2i8, Expand); + + setOperationAction(ISD::CTSELECT, MVT::v2i16, Promote); + setOperationPromotedToType(ISD::CTSELECT, MVT::v2i16, MVT::v4i16); + setOperationAction(ISD::CTSELECT, MVT::v4i8, Promote); + setOperationPromotedToType(ISD::CTSELECT, MVT::v4i8, MVT::v8i8); + // We 'support' these types up to bitcast/load/store level, regardless of // MVE integer-only / float support. Only doing FP data processing on the FP // vector types is inhibited at integer-only level. 
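Whether a small vector is scalarised by the type legalizer (ScalarizeVecRes_CTSELECT / SplitRes_CTSELECT above) or expanded per lane by the ISD::CTSELECT case added to SelectionDAGLegalize::ExpandNode, the end result is the same per-lane application of the scalar select. A rough model of that per-lane behaviour, with illustrative names not taken from the patch:

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Per-lane model of the vector expansion: extract each element, apply the
// scalar constant-time select, and rebuild the vector (EXTRACT_VECTOR_ELT /
// BUILD_VECTOR in the DAG).
template <std::size_t N>
std::array<uint32_t, N> expand_ctselect(uint32_t cond,
                                        const std::array<uint32_t, N> &t,
                                        const std::array<uint32_t, N> &f) {
  const uint32_t mask = 0u - (cond & 1u); // same scalar mask as the sketch above
  std::array<uint32_t, N> out{};
  for (std::size_t i = 0; i < N; ++i)
    out[i] = (t[i] & mask) | (f[i] & ~mask);
  return out;
}
```

The scalar Expand path added to SelectionDAGLegalize::ExpandNode handles the non-vector cases in a similar spirit: floating-point values are bitcast to a same-width integer before the select, and wide integers are split into low/high halves that are selected independently and re-paired.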
@@ -419,6 +444,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); @@ -474,6 +500,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::CTSELECT, VT, Custom); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); @@ -1237,10 +1264,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::i8, Promote); + setOperationAction(ISD::CTSELECT, MVT::i16, Promote); + setOperationPromotedToType(ISD::CTSELECT, MVT::i16, MVT::i32); + + setOperationAction(ISD::CTSELECT, MVT::i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::i64, Expand); + setOperationAction(ISD::CTSELECT, MVT::f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::f64, Custom); + + // Handle f16 and bf16 without falling back to select from ctselect. + setTargetDAGCombine({ISD::CTSELECT}); + if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SETCC, MVT::f16, Expand); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::f16, Custom); + } + + if (Subtarget->hasBF16()) { + setOperationAction(ISD::CTSELECT, MVT::bf16, Custom); } setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); @@ -1567,6 +1611,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(ARMISD::BCC_i64) MAKE_CASE(ARMISD::FMSTAT) MAKE_CASE(ARMISD::CMOV) + MAKE_CASE(ARMISD::CTSELECT) MAKE_CASE(ARMISD::SSAT) MAKE_CASE(ARMISD::USAT) MAKE_CASE(ARMISD::ASRL) @@ -5103,6 +5148,20 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SelectTrue, SelectFalse, ISD::SETNE); } +SDValue ARMTargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + + SDValue Cond = Op.getOperand(0); + SDValue TrueVal = Op.getOperand(1); + SDValue FalseVal = Op.getOperand(2); + EVT VT = Op.getValueType(); + + // Normalise the condition to 0 or 1. 
+ SDValue One = DAG.getConstant(1, DL, MVT::i32); + SDValue CondNode = DAG.getNode(ISD::AND, DL, MVT::i32, Cond, One); + return DAG.getNode(ARMISD::CTSELECT, DL, VT, TrueVal, FalseVal, CondNode); +} + static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps) { // Start by selecting the GE condition code for opcodes that return true for @@ -10599,6 +10658,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::CTSELECT: return LowerCTSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); @@ -10815,6 +10875,36 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::FP_TO_UINT_SAT: Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget); break; + case ISD::CTSELECT: { + EVT VT = N->getValueType(0); + + // Handle f16/bf16 type promotion while preserving ctselect + if (VT == MVT::f16 || VT == MVT::bf16) { + SDLoc DL(N); + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Bitcast to i16, then promote to i32 + SDValue TrueInt = DAG.getBitcast(MVT::i16, TrueVal); + SDValue FalseInt = DAG.getBitcast(MVT::i16, FalseVal); + + TrueInt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueInt); + FalseInt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseInt); + + // Normalize condition + SDValue One = DAG.getConstant(1, DL, MVT::i32); + SDValue CondNorm = DAG.getNode(ISD::AND, DL, MVT::i32, Cond, One); + + // Create i32 ctselect that will go through normal lowering + Res = DAG.getNode(ISD::CTSELECT, DL, MVT::i32, + CondNorm, TrueInt, FalseInt); + } else { + // For other types, use existing lowering + Res = LowerCTSELECT(SDValue(N, 0), DAG); + } + break; + } } if (Res.getNode()) Results.push_back(Res); @@ -13371,6 +13461,63 @@ static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) { DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts)); } +static SDValue PerformCTSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!DCI.isBeforeLegalize()) { + return SDValue(); + } + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + EVT VT = N->getValueType(0); + if (VT == MVT::f16 || VT == MVT::bf16) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + SDValue TrueInt = DAG.getBitcast(MVT::i16, TrueVal); + SDValue FalseInt = DAG.getBitcast(MVT::i16, FalseVal); + + // Create i16 ctselect - this will be promoted to i32 ctselect naturally + SDValue Result = DAG.getNode(ISD::CTSELECT, DL, MVT::i16, + Cond, TrueInt, FalseInt); + + return DAG.getBitcast(VT, Result); + } else if (VT.isVector()) { + EVT EltVT = VT.getVectorElementType(); + if (EltVT == MVT::f16 || EltVT == MVT::bf16) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + EVT IntVT; + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v4f16: + case MVT::v4bf16: + IntVT = MVT::v4i16; + break; + case MVT::v8f16: + case MVT::v8bf16: + IntVT = MVT::v8i16; + break; + default: + return SDValue(); // Unsupported vector type + } + + SDValue TrueInt = DAG.getBitcast(IntVT, TrueVal); + SDValue FalseInt = DAG.getBitcast(IntVT, FalseVal); + + SDValue Result = 
DAG.getNode(ISD::CTSELECT, DL, IntVT, + Cond, TrueInt, FalseInt); + + return DAG.getBitcast(VT, Result); + } + } + + return SDValue(); +} + static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { @@ -18874,6 +19021,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); + case ISD::CTSELECT: return PerformCTSELECTCombine(N, DCI, Subtarget); case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 70aa001a41885..5ca1769087873 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -97,6 +97,9 @@ class VectorType; CMOV, // ARM conditional move instructions. + CTSELECT, // ARM constant-time select, implemented with constant-time + // bitwise arithmetic instructions. + SSAT, // Signed saturation USAT, // Unsigned saturation @@ -430,8 +433,12 @@ class VectorType; const char *getTargetNodeName(unsigned Opcode) const override; bool isSelectSupported(SelectSupportKind Kind) const override { - // ARM does not support scalar condition selects on vectors. - return (Kind != ScalarCondVectorVal); + if (Kind == SelectSupportKind::CtSelect) { + return true; + } else { + // ARM does not support scalar condition selects on vectors. + return (Kind != SelectSupportKind::ScalarCondVectorVal); + } } bool isReadOnly(const GlobalValue *GV) const; @@ -880,6 +887,7 @@ class VectorType; SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; @@ -1025,6 +1033,7 @@ class VectorType; MachineBasicBlock *MBB) const; MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const; + void addMVEVectorTypes(bool HasMVEFP); void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action); void setAllExpand(MVT VT); diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 282ff534fc112..b8597f97b43df 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -32,6 +32,13 @@ def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>; def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_ARMCtSelect : SDTypeProfile<1, 3, [ + /* any */ // result + SDTCisSameAs<1, 0>, // value on false + SDTCisSameAs<2, 0>, // value on true + SDTCisVT<3, i32> // cond +]>; + def SDT_ARMCMov : SDTypeProfile<1, 4, [ /* any */ // result SDTCisSameAs<1, 0>, // value on false @@ -188,6 +195,7 @@ def ARMseretglue : SDNode<"ARMISD::SERET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMintretglue : SDNode<"ARMISD::INTRET_GLUE", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def ARMctselect : SDNode<"ARMISD::CTSELECT", SDT_ARMCtSelect>; def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov>; def ARMssat : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; @@ -5108,6 
+5116,226 @@ def : ARMPat<(ARMcmov i32:$false, mod_imm_not:$imm, imm:$cc, CPSR), def : ARMV6T2Pat<(ARMcmov i32:$false, imm:$src, imm:$cc, CPSR), (MOVCCi32imm $false, imm:$src, imm:$cc, CPSR)>; +//===----------------------------------------------------------------------===// +// Constant-time selection pseudoinstructions. +// We use a machine pass to lower these pseudos as applicable by subtarget, +// in order to avoid backend optimizations that could invalidate constant-time +// guarantees to the source programmer by node merging or other operations that +// would result in machine code that does not run in constant time. +let isNotDuplicable = 1, + isPseudo = 1, + hasNoSchedulingInfo = 1 in { + + // i1, i8, i16, i32, i64 + def CTSELECTint : ARMPseudoInst< + (outs GPR:$dst, GPR:$tmp_mask), + (ins GPR:$src1, GPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask"; + } + + def CTSELECTf16 : ARMPseudoInst< + (outs HPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2), + (ins HPR:$src1, HPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $scratch1,@earlyclobber $scratch2"; + } + + def CTSELECTbf16 : ARMPseudoInst< + (outs HPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2), + (ins HPR:$src1, HPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $scratch1,@earlyclobber $scratch2"; + } + + def CTSELECTf32 : ARMPseudoInst< + (outs SPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2), + (ins SPR:$src1, SPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $scratch1,@earlyclobber $scratch2"; + } + + let Predicates = [HasDPVFP] in { + def CTSELECTf64 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + } + + let Predicates = [HasNEON] in { + // DPR + def CTSELECTv8i8 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4i16 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2i32 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv1i64 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2f32 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4f16 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, 
DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4bf16 : ARMPseudoInst< + (outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask), + (ins DPR:$src1, DPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + + // QPR + def CTSELECTv16i8 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv8i16 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4i32 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2i64 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv4f32 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv2f64 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv8f16 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + def CTSELECTv8bf16 : ARMPseudoInst< + (outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask), + (ins QPR:$src1, QPR:$src2, GPR:$cond), + 4, + NoItinerary, + [] + > { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber $bcast_mask"; + } + + } +} + //===----------------------------------------------------------------------===// // Atomic operations intrinsics // diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 86740a92b32c5..18d47d9c68767 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -519,7 +519,8 @@ void ARMPassConfig::addPreEmitPass() { // Constant island pass work on unbundled instructions. addPass(createUnpackMachineBundles([](const MachineFunction &MF) { - return MF.getSubtarget().isThumb2(); + return MF.getSubtarget().isThumb2() || + MF.getFunction().hasFnAttribute("ct-select"); })); // Don't optimize barriers or block placement at -O0. 
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 8e08d16342975..d306d489a43d2 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -828,9 +828,10 @@ include "X86SchedSapphireRapids.td" def ProcessorFeatures { // x86-64 micro-architecture levels: x86-64 and x86-64-v[234] - list X86_64V1Features = [ - FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2, - FeatureFXSR, FeatureNOPL, FeatureX86_64, + list X86_64V1Features = [FeatureX87, FeatureCX8, + FeatureCMOV, FeatureMMX, + FeatureSSE2, FeatureFXSR, + FeatureNOPL, FeatureX86_64, ]; list X86_64V1Tuning = [ TuningMacroFusion, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a0b64ff370b10..a11ef3833b69b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" +#include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86ShuffleDecode.h" #include "X86.h" #include "X86FrameLowering.h" @@ -29,6 +30,8 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -48,6 +51,7 @@ #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" @@ -488,6 +492,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // X86 wants to expand cmov itself. 
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); @@ -496,11 +501,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); + setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); @@ -630,6 +637,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, VT, Action); setOperationAction(ISD::SETCC, VT, Action); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Action); setOperationAction(ISD::FROUND, VT, Action); setOperationAction(ISD::FROUNDEVEN, VT, Action); @@ -1067,6 +1075,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); @@ -1220,6 +1229,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v8f16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); + setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8i16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); @@ -1541,6 +1557,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8i32, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16i16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v16f16, Custom); + setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom); + setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom); + for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); @@ -1727,6 +1751,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); + setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); @@ 
-1772,6 +1797,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -2038,6 +2064,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); @@ -2203,6 +2230,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); @@ -2269,6 +2297,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::CTSELECT, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); @@ -2538,6 +2567,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::x86amx, &X86::TILERegClass); } + // Handle 512-bit vector CTSELECT without AVX512 by setting them to Expand + // This allows type legalization to split them into smaller vectors + for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16, + MVT::v16f32, MVT::v8f64}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + + // Handle 256-bit vector CTSELECT without AVX by setting them to Expand + // This allows type legalization to split them into 128-bit vectors + if (!Subtarget.hasAVX()) { + for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16, + MVT::v16f16, MVT::v32i8, MVT::v8f32}) { + setOperationAction(ISD::CTSELECT, VT, Expand); + } + } + // We want to custom lower some of our intrinsics. 
 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -2643,6 +2688,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::BITCAST,
                        ISD::VSELECT,
                        ISD::SELECT,
+                       ISD::CTSELECT,
                        ISD::SHL,
                        ISD::SRA,
                        ISD::SRL,
@@ -25321,6 +25367,174 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
   return V;
 }
+SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);    // condition
+  SDValue TrueOp = Op.getOperand(1);  // true_value
+  SDValue FalseOp = Op.getOperand(2); // false_value
+  SDLoc DL(Op);
+  MVT VT = TrueOp.getSimpleValueType();
+
+  // Special handling for i386 targets (no CMOV): route to post-RA expansion
+  // pseudos. Standard type legalization handles i64 automatically (it is
+  // split into EDX:EAX).
+
+  // Handle soft float16 by converting to integer operations
+  if (isSoftF16(VT, Subtarget)) {
+    MVT NVT = VT.changeTypeToInteger();
+    SDValue CtSelect =
+        DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, DAG.getBitcast(NVT, FalseOp),
+                    DAG.getBitcast(NVT, TrueOp));
+    return DAG.getBitcast(VT, CtSelect);
+  }
+
+  // Handle vector types
+  if (VT.isVector()) {
+    // Handle soft float16 vectors
+    if (isSoftF16(VT, Subtarget)) {
+      MVT NVT = VT.changeVectorElementTypeToInteger();
+      SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, NVT, Cond,
+                                     DAG.getBitcast(NVT, FalseOp),
+                                     DAG.getBitcast(NVT, TrueOp));
+      return DAG.getBitcast(VT, CtSelect);
+    }
+
+    unsigned VectorWidth = VT.getSizeInBits();
+    MVT EltVT = VT.getVectorElementType();
+
+    // 512-bit vectors without AVX512 and 256-bit vectors without AVX are now
+    // handled by type legalization (Expand action).
+
+    if (VectorWidth == 128 && !Subtarget.hasSSE1())
+      return SDValue();
+
+    // Handle special cases for floating point vectors
+    if (EltVT.isFloatingPoint()) {
+      // For vector floating point with AVX, use VBLENDV-style operations
+      if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) {
+        // Convert to bitwise operations using the condition
+        MVT IntVT = VT.changeVectorElementTypeToInteger();
+        SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp);
+        SDValue IntOp2 = DAG.getBitcast(IntVT, FalseOp);
+
+        // Create the CTSELECT node with integer types
+        SDValue IntResult =
+            DAG.getNode(X86ISD::CTSELECT, DL, IntVT, IntOp2, IntOp1,
+                        DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8),
+                        EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget));
+        return DAG.getBitcast(VT, IntResult);
+      }
+    }
+
+    // For integer vectors or when we don't have advanced SIMD support,
+    // use the generic X86 CTSELECT node which will be matched by the patterns
+    SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+    SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+    // Create the X86 CTSELECT node - note operand order: false, true, cc, flags
+    return DAG.getNode(X86ISD::CTSELECT, DL, VT, FalseOp, TrueOp, CC, EFLAGS);
+  }
+
+  // Look past (and (setcc_carry (cmp ...)), 1)
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+      isOneConstant(Cond.getOperand(1)))
+    Cond = Cond.getOperand(0);
+
+  /// Process condition flags and prepare for CTSELECT node creation
+  auto ProcessConditionFlags =
+      [&](SDValue Cond, MVT VT, SDLoc DL, SelectionDAG &DAG,
+          const X86Subtarget &Subtarget) -> std::pair<SDValue, SDValue> {
+    SDValue CC;
+    bool AddTest = true;
+
+    unsigned CondOpcode = Cond.getOpcode();
+    if (CondOpcode == 
X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + SDValue Cmp = Cond.getOperand(1); + + if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) { + Cond = Cmp; + AddTest = false; + } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { + SDValue Value; + X86::CondCode X86Cond; + std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); + CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8); + AddTest = false; + } + + if (AddTest) { + // Look past the truncate if the high bits are known zero + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // Try to match AND to BT instruction + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + X86::CondCode X86CondCode; + if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) { + CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8); + Cond = BT; + AddTest = false; + } + } + } + + if (AddTest) { + CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); + } + + return {CC, Cond}; + }; + + // Process condition flags and prepare for CTSELECT + auto [CC, ProcessedCond] = + ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget); + + // Handle i8 CTSELECT with truncate optimization + if (Op.getValueType() == MVT::i8 && TrueOp.getOpcode() == ISD::TRUNCATE && + FalseOp.getOpcode() == ISD::TRUNCATE) { + SDValue T1 = TrueOp.getOperand(0), T2 = FalseOp.getOperand(0); + if (T1.getValueType() == T2.getValueType() && + T1.getOpcode() != ISD::CopyFromReg && + T2.getOpcode() != ISD::CopyFromReg) { + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(), + T2, T1, CC, ProcessedCond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); + } + } + + // Promote small integer types to avoid partial register stalls + // Exception: For i8 without CMOV, we can generate a shorter instruction + // sequence without movzx so keep it as is. + if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMOV()) || + (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) && + !X86::mayFoldLoad(FalseOp, Subtarget))) { + TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp); + FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp); + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect); + } + + if (isScalarFPTypeInSSEReg(VT)) { + MVT IntVT = (VT == MVT::f32) ? 
MVT::i32 : MVT::i64; + TrueOp = DAG.getBitcast(IntVT, TrueOp); + FalseOp = DAG.getBitcast(IntVT, FalseOp); + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; + SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, IntVT, Ops); + return DAG.getBitcast(VT, CtSelect); + } + + // Create final CTSELECT node + SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond}; + return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops, + Op->getFlags()); +} + static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); @@ -33684,6 +33898,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::CTSELECT: return LowerCTSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); @@ -33767,6 +33982,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const { + if (Kind == SelectSupportKind::CtSelect) { + return true; + } + return TargetLoweringBase::isSelectSupported(Kind); +} /// Replace a node with an illegal result type with a new node built out of /// custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, @@ -34994,6 +35215,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(STRICT_CMPM) NODE_NAME_CASE(CMPMM_SAE) NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(CTSELECT) NODE_NAME_CASE(SETCC_CARRY) NODE_NAME_CASE(FSETCC) NODE_NAME_CASE(FSETCCM) @@ -37767,6 +37989,480 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI, return BB; } +/// Helper function to emit i386 CTSELECT with condition materialization. +/// This converts EFLAGS-based CTSELECT into a condition byte that can be +/// shared across multiple operations (critical for i64 type legalization). +/// +/// Phase 1: Materialize condition byte from EFLAGS using SETCC +/// Phase 2: Create internal pseudo with condition byte for post-RA expansion +/// +/// This approach ensures that when i64 is type-legalized into two i32 +/// operations, both operations share the same condition byte rather than +/// each independently reading (and destroying) EFLAGS. 
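+///
+/// Illustrative C-level sketch of the selection the expanded bundle performs
+/// (an assumption for exposition only; the exact instruction sequence is
+/// produced by the post-RA expansion of the internal pseudo, not here):
+///
+///   uint32_t mask = 0u - (uint32_t)cond_byte;  // cond_byte ? all-ones : 0
+///   dst = (src1 & mask) | (src2 & ~mask);      // branch-free select
+///
+/// Because both halves of a legalized i64 reuse the same cond_byte, the
+/// selection stays branch-free and constant-time after type legalization.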
+static MachineBasicBlock * +emitCTSelectI386WithConditionMaterialization(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned InternalPseudoOpcode) { + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const MIMetadata MIMD(MI); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Original pseudo operands: (outs dst), (ins src1, src2, cond) + Register Src1Reg = MI.getOperand(1).getReg(); + Register Src2Reg = MI.getOperand(2).getReg(); + X86::CondCode CC = static_cast(MI.getOperand(3).getImm()); + + // Get opposite condition (SETCC sets to 1 when condition is TRUE, + // but we want to select src1 when condition is FALSE for X86 semantics) + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + + // Step 1: Materialize condition byte from EFLAGS + // This is done OUTSIDE the constant-time bundle, before any EFLAGS corruption + Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC); + + // Step 2: Create internal pseudo that takes condition byte as input + // This pseudo will be expanded post-RA into the actual constant-time bundle + // The condition byte can now be safely shared between multiple pseudos + + // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1, + // src2, cond_byte) + Register DstReg = MI.getOperand(0).getReg(); + + // Create virtual registers for the temporary outputs + Register TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + Register TmpMaskReg; + + // Determine the register class for tmp_mask based on the data type + if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass); + } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass); + } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) { + TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + } else { + llvm_unreachable("Unknown internal pseudo opcode"); + } + + BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode)) + .addDef(DstReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(Src1Reg) // src1 (input) + .addReg(Src2Reg) // src2 (input) + .addReg(CondByteReg); // pre-materialized condition byte (input) + + MI.eraseFromParent(); + return BB; +} + +// Helper structure to hold memory operand information for FP loads +struct FPLoadMemOperands { + bool IsValid = false; + unsigned BaseReg = 0; + int64_t ScaleVal = 1; + unsigned IndexReg = 0; + int64_t Disp = 0; + unsigned SegReg = 0; + int FrameIndex = -1; + bool IsFrameIndex = false; + int ConstantPoolIndex = -1; + bool IsConstantPool = false; + const GlobalValue *Global = nullptr; + int64_t GlobalOffset = 0; + bool IsGlobal = false; +}; + +// Check if a virtual register is defined by a simple FP load instruction +// Returns the memory operands if it's a simple load, otherwise returns invalid +static FPLoadMemOperands getFPLoadMemOperands(Register Reg, + MachineRegisterInfo &MRI, + unsigned ExpectedLoadOpcode) { + FPLoadMemOperands Result; + + if (!Reg.isVirtual()) + return Result; + + MachineInstr *DefMI = MRI.getVRegDef(Reg); + if (!DefMI) + return Result; + + // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m) + if (DefMI->getOpcode() != ExpectedLoadOpcode) + return Result; + + // Check that this is a simple load - not volatile, 
not atomic, etc. + // FP loads have hasSideEffects = 0 in their definition for simple loads + if (DefMI->hasOrderedMemoryRef()) + return Result; + + // The load should have a single def (the destination register) and memory operands + // Format: %reg = LD_Fpxxm , 1, %noreg, 0, %noreg + // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment + if (DefMI->getNumOperands() < 6) + return Result; + + // Operand 0 is the destination, operands 1-5 are the memory reference + MachineOperand &BaseMO = DefMI->getOperand(1); + MachineOperand &ScaleMO = DefMI->getOperand(2); + MachineOperand &IndexMO = DefMI->getOperand(3); + MachineOperand &DispMO = DefMI->getOperand(4); + MachineOperand &SegMO = DefMI->getOperand(5); + + // Check if this is a frame index load + if (BaseMO.isFI()) { + Result.IsValid = true; + Result.IsFrameIndex = true; + Result.FrameIndex = BaseMO.getIndex(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = DispMO.getImm(); + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Check if this is a constant pool load + // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && + ScaleMO.isImm() && IndexMO.isReg() && + IndexMO.getReg() == X86::NoRegister && + DispMO.isCPI() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsConstantPool = true; + Result.ConstantPoolIndex = DispMO.getIndex(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = 0; + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Check if this is a global variable load + // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg + if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && + ScaleMO.isImm() && IndexMO.isReg() && + IndexMO.getReg() == X86::NoRegister && + DispMO.isGlobal() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsGlobal = true; + Result.Global = DispMO.getGlobal(); + Result.GlobalOffset = DispMO.getOffset(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = 0; + Result.SegReg = SegMO.getReg(); + return Result; + } + + // Regular memory operands (e.g., pointer loads) + if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() && + DispMO.isImm() && SegMO.isReg()) { + Result.IsValid = true; + Result.IsFrameIndex = false; + Result.IsConstantPool = false; + Result.BaseReg = BaseMO.getReg(); + Result.ScaleVal = ScaleMO.getImm(); + Result.IndexReg = IndexMO.getReg(); + Result.Disp = DispMO.getImm(); + Result.SegReg = SegMO.getReg(); + return Result; + } + + return Result; +} + +static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI, + MachineBasicBlock *BB, + unsigned pseudoInstr) { + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); + const MIMetadata MIMD(MI); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + unsigned RegSizeInByte = 4; + + // Get operands + // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned FalseReg = MI.getOperand(1).getReg(); + unsigned TrueReg = MI.getOperand(2).getReg(); + X86::CondCode CC = static_cast(MI.getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + + // Materialize condition byte from EFLAGS + Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + BuildMI(*BB, MI, MIMD, 
TII->get(X86::SETCCr), CondByteReg).addImm(OppCC); + + auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) { + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot) + .addReg(Reg, RegState::Kill); + }; + + // Helper to load integer from memory operands + auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps, + unsigned Offset) -> unsigned { + unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg); + + if (MemOps.IsFrameIndex) { + // Frame index: addFrameIndex + scale + index + disp + segment + MIB.addFrameIndex(MemOps.FrameIndex) + .addImm(MemOps.ScaleVal) + .addReg(MemOps.IndexReg) + .addImm(MemOps.Disp + Offset) + .addReg(MemOps.SegReg); + } else if (MemOps.IsConstantPool) { + // Constant pool: base_reg + scale + index + CP_index + segment + // MOV32rm format: base, scale, index, displacement, segment + MIB.addReg(X86::NoRegister) // Base register + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index) + .addReg(MemOps.SegReg); // Segment + } else if (MemOps.IsGlobal) { + // Global variable: base_reg + scale + index + global + segment + // MOV32rm format: base, scale, index, displacement, segment + MIB.addReg(X86::NoRegister) // Base register + .addImm(MemOps.ScaleVal) // Scale + .addReg(MemOps.IndexReg) // Index register + .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address) + .addReg(MemOps.SegReg); // Segment + } else { + // Regular memory: base_reg + scale + index + disp + segment + MIB.addReg(MemOps.BaseReg) + .addImm(MemOps.ScaleVal) + .addReg(MemOps.IndexReg) + .addImm(MemOps.Disp + Offset) + .addReg(MemOps.SegReg); + } + + return IntReg; + }; + + // Optimized path: load integers directly from memory when both operands are + // memory loads, avoiding FP register round-trip + auto emitCtSelectFromMemory = [&](unsigned NumValues, + const FPLoadMemOperands &TrueMemOps, + const FPLoadMemOperands &FalseMemOps, + int ResultSlot) { + for (unsigned Val = 0; Val < NumValues; ++Val) { + unsigned Offset = Val * RegSizeInByte; + + // Load true and false values directly from their memory locations as integers + unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset); + unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset); + + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection + unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) + + // Store result back to result slot + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) + .addFrameIndex(ResultSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0) + .addReg(ResultIntReg, RegState::Kill); + } + }; + + auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) { + for (unsigned Val = 0; Val < NumValues; ++Val) { + unsigned Offset = Val * RegSizeInByte; + + // 
Load true and false values from stack as 32-bit integers + unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg) + .addFrameIndex(TrueSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0); + + unsigned FalseIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseIntReg) + .addFrameIndex(FalseSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0); + + // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection + unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass); + unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass); + unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass); + + BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr)) + .addDef(ResultIntReg) // dst (output) + .addDef(TmpByteReg) // tmp_byte (output) + .addDef(TmpMaskReg) // tmp_mask (output) + .addReg(FalseIntReg) // src1 (input) - false value + .addReg(TrueIntReg) // src2 (input) - true value + .addReg(CondByteReg); // pre-materialized condition byte (input) + + // Store result back to result slot + BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr)) + .addFrameIndex(ResultSlot) + .addImm(1) + .addReg(0) + .addImm(Offset) + .addReg(0) + .addReg(ResultIntReg, RegState::Kill); + } + }; + + switch (pseudoInstr) { + case X86::CTSELECT_I386_FP32rr: { + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m); + + int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot); + + // Erase the original FP load instructions since we're not using them + // and have loaded the data directly as integers instead + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false); + + storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot); + } + + // Load result back as f32 + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg), + ResultSlot); + break; + } + case X86::CTSELECT_I386_FP64rr: { + unsigned StackSlotSize = 8; + + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m); + + int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(StackSlotSize / 
RegSizeInByte, TrueMemOps, + FalseMemOps, ResultSlot); + + // Erase the original FP load instructions since we're not using them + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false); + + storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot, + ResultSlot); + } + + // Load result back as f64 + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg), + ResultSlot); + break; + } + case X86::CTSELECT_I386_FP80rr: { + // f80 is 80 bits (10 bytes), but stored with 12-byte alignment + unsigned StackObjectSize = 12; + + // Check if both operands are simple memory loads + FPLoadMemOperands TrueMemOps = + getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m); + FPLoadMemOperands FalseMemOps = + getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m); + + int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + + if (TrueMemOps.IsValid && FalseMemOps.IsValid) { + // Optimized path: load directly from memory as integers + // Works for both frame index loads (stack parameters) and + // constant pool loads (constants) + emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps, + FalseMemOps, ResultSlot); + + // Erase the original FP load instructions since we're not using them + if (MRI.hasOneUse(TrueReg)) { + if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg)) + TrueDefMI->eraseFromParent(); + } + if (MRI.hasOneUse(FalseReg)) { + if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg)) + FalseDefMI->eraseFromParent(); + } + } else { + // General path: spill FP registers to stack first + int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false); + + storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg); + storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg); + + emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot, + FalseSlot, ResultSlot); + } + + // Load result back as f80 + addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg), + ResultSlot); + break; + } + default: + llvm_unreachable("Invalid CTSELECT opcode"); + } + + MI.eraseFromParent(); + + return BB; +} + MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -37828,6 +38524,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::CMOV_VK64: return EmitLoweredSelect(MI, BB); + case X86::CTSELECT_I386_GR8rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR8rr); + + case X86::CTSELECT_I386_GR16rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR16rr); + + case X86::CTSELECT_I386_GR32rr: + return emitCTSelectI386WithConditionMaterialization( + MI, BB, X86::CTSELECT_I386_INT_GR32rr); + + case X86::CTSELECT_I386_FP32rr: + return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP32rr); + case X86::CTSELECT_I386_FP64rr: + return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr); + case 
X86::CTSELECT_I386_FP80rr: + return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr); + case X86::FP80_ADDr: case X86::FP80_ADDm32: { // Change the floating point control register to use double extended diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e28b9c11a04cd..f79eec03de23c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -114,6 +114,10 @@ namespace llvm { /// X86 Select SELECTS, + /// X86 Constant-time Select, implemented with CMOV instruction. This is + /// used to implement constant-time select. + CTSELECT, + // Same as SETCC except it's materialized with a sbb and the value is all // one's or all zero's. SETCC_CARRY, // R = carry_bit ? ~0 : 0 @@ -1139,6 +1143,8 @@ namespace llvm { /// SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + bool isSelectSupported(SelectSupportKind Kind) const override; + /// Replace the results of node with an illegal result /// type with new values built out of custom code. /// @@ -1766,6 +1772,7 @@ namespace llvm { SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 7d5d7cf4a83ab..9c34889f03354 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -106,6 +106,211 @@ let Predicates = [HasCMOV, HasNDD] in { def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS), (CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; } + +// Create pseudo instruction and do the pattern matching to them. 
+// We use a machine pass to lower these pseudos into cmov, in order
+// to avoid backend optimizations that could break constant-time guarantees.
+let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
+
+  multiclass CTSELECT<X86TypeInfo t> {
+    // register-only
+    let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV],
+        AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+      def rr : PseudoI<(outs t.RegClass:$dst),
+                 (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
+                 [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
+    }
+
+    // register-memory
+    let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV],
+        AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+      def rm : PseudoI<(outs t.RegClass:$dst),
+                 (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
+                 [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+    }
+  }
+}
+
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+  let Constraints = "$dst = $src1" in {
+    defm CTSELECT16 : CTSELECT<Xi16>;
+    defm CTSELECT32 : CTSELECT<Xi32>;
+    defm CTSELECT64 : CTSELECT<Xi64>;
+  }
+}
+
+// CTSELECT_VEC base class
+class CTSELECT_VEC<RegisterClass VRc, RegisterClass GRc>
+    : PseudoI<
+        (outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
+        (ins VRc:$t, VRc:$f, i8imm:$cond),
+        []
+      > {
+  let Uses = [EFLAGS];
+  let isPseudo = 1;
+  let isNotDuplicable = 1;
+  let hasSideEffects = 1;
+  let AsmString = "ctselect\t$dst, $f, $t, $cond";
+  let SchedRW = [];
+}
+
+// Width-specific class aliases
+class CTSELECT_VEC128 : CTSELECT_VEC<VR128, GR32>;
+class CTSELECT_VEC128X : CTSELECT_VEC<VR128X, GR32>;
+class CTSELECT_VEC256 : CTSELECT_VEC<VR256, GR32>;
+class CTSELECT_VEC512 : CTSELECT_VEC<VR512, GR32>;
+
+
+//===----------------------------------------------------------------------===//
+// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander)
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+  def CTSELECT_V4F32 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+let Predicates = [HasSSE2] in {
+
+  def CTSELECT_V2F64 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V4I32 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V2I64 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V8I16 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V16I8 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // If your build has v8f16, keep this; otherwise comment it out.
+ def CTSELECT_V8F16 : CTSELECT_VEC128 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +let Predicates = [HasAVX] in { + + def CTSELECT_V4F32X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V2F64X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4I32X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V2I64X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V8I16X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V16I8X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + + // If your build has v8f16, keep this; otherwise comment it out. + def CTSELECT_V8F16X : CTSELECT_VEC128X { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +//===----------------------------------------------------------------------===// +// 256-bit pseudos +//===----------------------------------------------------------------------===// +let Predicates = [HasAVX] in { + + def CTSELECT_V8F32 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4F64 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V8I32 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V4I64 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V16I16 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + def CTSELECT_V32I8 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } + + // If your build has v16f16, keep this; otherwise comment it out. + def CTSELECT_V16F16 : CTSELECT_VEC256 { + let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg"; + } +} + +//===----------------------------------------------------------------------===// +// Selection patterns: X86ctselect(...), EFLAGS -> CTSELECT_V* +// +// NOTE: +// * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue). +// * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read. +// * Temps (tmpx/tmpy,tmpg) are not in the pattern; they’re outs allocated by RA. 
+//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE1] in { + + // 128-bit float (bitwise-equivalent ops in expander) + def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>; +} + +let Predicates = [HasSSE2] in { + + // 128-bit integer + def : Pat<(v4i32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v2i64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v8i16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v16i8 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>; + def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>; + + // 128-bit f16 (optional) + def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>; +} + +let Predicates = [HasAVX] in { + + // 256-bit integer + def : Pat<(v8i32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8I32 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v4i64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4I64 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v16i16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v32i8 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V32I8 VR256:$t, VR256:$f, timm:$cc)>; + + // 256-bit float (bitwise-equivalent ops in expander) + def : Pat<(v8f32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>; + def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>; + + // 256-bit f16 (optional) + def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)), + (CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>; +} + let Predicates = [HasCMOV, HasCF] in { def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS), (CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index ec31675731b79..f4163f55d66ce 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -693,6 +693,86 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +// CTSELECT +// Enhanced CTSELECT pseudos for i386 with temporary register allocation +// These use a two-phase approach: +// 1. Custom inserter materializes condition byte from EFLAGS +// 2. 
Post-RA expansion generates constant-time instruction bundles + +let isPseudo = 1, isNotDuplicable = 1 in { + // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter) + // These are matched by patterns and convert EFLAGS to condition byte + class CTSELECT_I386_INITIAL + : PseudoI<(outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$cond), + [(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2, timm:$cond, + EFLAGS)))]> { + let Uses = [EFLAGS]; + let Defs = [EFLAGS]; + let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; + } + + // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA expansion) + // These generate the actual constant-time instruction bundles + class CTSELECT_I386_INTERNAL + : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask), + (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> { + let hasNoSchedulingInfo = 1; + let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber $tmp_mask"; + } +} + +// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition) +let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { + let Predicates = [NoNativeCMOV] in { + def CTSELECT_I386_GR8rr : CTSELECT_I386_INITIAL; + def CTSELECT_I386_GR16rr : CTSELECT_I386_INITIAL; + def CTSELECT_I386_GR32rr : CTSELECT_I386_INITIAL; + } +} + +// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte) +let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in { + let Predicates = [NoNativeCMOV] in { + def CTSELECT_I386_INT_GR8rr : + CTSELECT_I386_INTERNAL; + def CTSELECT_I386_INT_GR16rr : + CTSELECT_I386_INTERNAL; + def CTSELECT_I386_INT_GR32rr : + CTSELECT_I386_INTERNAL; + } +} + +let hasSideEffects = 1, + ForceDisassemble = 1, + Constraints = "$dst = $src1" in { + + let Predicates = [FPStackf32] in + def CTSELECT_I386_FP32rr : CTSELECT_I386_INITIAL; + + let Predicates = [FPStackf64] in + def CTSELECT_I386_FP64rr : CTSELECT_I386_INITIAL; + + def CTSELECT_I386_FP80rr : CTSELECT_I386_INITIAL; +} + +// Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization) +// NoNativeCMOV ensures these patterns are used when actual CMOV instruction is not available +// even if canUseCMOV() is true (e.g., i386 with SSE which can emulate CMOV) +let Predicates = [NoNativeCMOV] in { + def : Pat<(i8(X86ctselect GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>; + + def : Pat<(i16(X86ctselect GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>; + + def : Pat<(i32(X86ctselect GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)), + (CTSELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>; + + // i64 patterns handled automatically by type legalization +} + //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index 116986a0fffea..4c9e5bae3b46c 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -28,6 +28,10 @@ def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; +def SDTX86CtSelect : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + // Unary and binary 
operator instructions that set EFLAGS as a side-effect. def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, [SDTCisSameAs<0, 2>, @@ -151,6 +155,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>; def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86ctselect: SDNode<"X86ISD::CTSELECT", SDTX86CtSelect, [SDNPInGlue]>; def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, [SDNPHasChain]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 1d2cd39951bf4..ef270fc49a224 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -475,6 +475,556 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op, return false; } +struct CtSelectInstructions { + unsigned PAndOpc; + unsigned PAndnOpc; + unsigned POrOpc; + unsigned BroadcastOpc; + unsigned IntMoveOpc; + unsigned MoveOpc; + bool Use256; + bool UseBlendInstr; +}; + +static CtSelectInstructions +getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) { + CtSelectInstructions Instructions = {}; + + switch (Opcode) { + case X86::CTSELECT_V2F64: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPDrr; + Instructions.UseBlendInstr = true; + } else { + llvm_unreachable("Double precision vectors require SSE2"); + } + break; + case X86::CTSELECT_V4F32: + if (Subtarget.hasSSE41()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + Instructions.UseBlendInstr = true; + } else if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } else { + // fallback to SSE1, only support four 32-bit single precision + // floating-point values + Instructions.PAndOpc = X86::ANDPSrr; + Instructions.PAndnOpc = X86::ANDNPSrr; + Instructions.POrOpc = X86::ORPSrr; + Instructions.BroadcastOpc = X86::SHUFPSrri; + Instructions.IntMoveOpc = X86::MOVSS2DIrr; + Instructions.MoveOpc = X86::MOVAPSrr; + } + break; + case X86::CTSELECT_V4I32: + case X86::CTSELECT_V2I64: + case X86::CTSELECT_V8I16: + case X86::CTSELECT_V16I8: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("Integer vector operations require SSE2"); + } + break; + case X86::CTSELECT_V8F16: + if (Subtarget.hasSSE2()) { + Instructions.PAndOpc = X86::PANDrr; + Instructions.PAndnOpc = X86::PANDNrr; + Instructions.POrOpc = X86::PORrr; + Instructions.BroadcastOpc = X86::PSHUFDri; + Instructions.IntMoveOpc = X86::MOVDI2PDIrr; + Instructions.MoveOpc = X86::MOVDQArr; + } else { + llvm_unreachable("FP16 vector operations require SSE2"); + } + 
break; + case X86::CTSELECT_V4F32X: + case X86::CTSELECT_V4I32X: + case X86::CTSELECT_V2F64X: + case X86::CTSELECT_V2I64X: + case X86::CTSELECT_V8I16X: + case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V8F16X: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDrr; + Instructions.PAndnOpc = X86::VPANDNrr; + Instructions.POrOpc = X86::VPORrr; + Instructions.BroadcastOpc = X86::VPSHUFDri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F32X) ? X86::VMOVAPSrr + : (Opcode == X86::CTSELECT_V2F64X) + ? X86::VMOVAPDrr + : X86::VMOVDQArr; + } else { + llvm_unreachable("AVX variants require AVX support"); + } + break; + case X86::CTSELECT_V8F32: + case X86::CTSELECT_V8I32: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else { + llvm_unreachable("256-bit vectors require AVX"); + } + break; + case X86::CTSELECT_V4F64: + case X86::CTSELECT_V4I64: + if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPDYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = + (Opcode == X86::CTSELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else { + llvm_unreachable("256-bit vectors require AVX"); + } + break; + case X86::CTSELECT_V16I16: + case X86::CTSELECT_V32I8: + case X86::CTSELECT_V16F16: + if (Subtarget.hasAVX2()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else if (Subtarget.hasAVX()) { + Instructions.PAndOpc = X86::VPANDYrr; + Instructions.PAndnOpc = X86::VPANDNYrr; + Instructions.POrOpc = X86::VPORYrr; + Instructions.BroadcastOpc = X86::VPERMILPSYri; + Instructions.IntMoveOpc = X86::VMOVDI2PDIrr; + Instructions.MoveOpc = X86::VMOVDQAYrr; + Instructions.Use256 = true; + } else { + llvm_unreachable("256-bit integer vectors require AVX"); + } + break; + default: + llvm_unreachable("Unexpected CTSELECT opcode"); + } + + return Instructions; +} + +bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + const DebugLoc &DL = MI.getDebugLoc(); + auto Instruction = getCtSelectInstructions(Opcode, Subtarget); + + MachineBasicBlock *MBB = MI.getParent(); + + // Operand layout matches the TableGen definition: + // (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg), + // (ins VR128:$t, VR128:$f, i8imm:$cond) + Register Dst = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); // vector mask temp + Register TmpGPR = MI.getOperand(2).getReg(); // scalar mask temp (GPR32) + Register FalseVal = MI.getOperand(3).getReg(); // true_value + Register TrueVal = MI.getOperand(4).getReg(); // false_value + X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition + + MachineInstr *FirstInstr = nullptr; + MachineInstr *LastInstr = nullptr; + auto recordInstr = [&](MachineInstrBuilder MIB) { + MachineInstr *NewMI = MIB.getInstr(); + 
LastInstr = NewMI; + if (!FirstInstr) + FirstInstr = NewMI; + }; + + // Create scalar mask in tempGPR and broadcast to vector mask + recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR) + .addImm(0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit); + recordInstr(BuildMI(*MBB, MI, DL, get(X86::SETCCr)) + .addReg(SubReg) + .addImm(CC) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + // Zero-extend byte to 32-bit register (movzbl %al, %eax) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR) + .addReg(SubReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) { + // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31, + // %eax) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR) + .addReg(TmpGPR) + .addImm(31)); + } else { + // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR) + .addReg(TmpGPR)); + } + + // Broadcast to TmpX (vector mask) + recordInstr(BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg) + .addReg(MaskReg) + .addReg(MaskReg) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + // Move scalar mask to vector register + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg) + .addReg(TmpGPR) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + + if (Instruction.Use256) { + // Broadcast to 256-bit vector register + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addImm(0) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + } else { + if (Subtarget.hasSSE2() || Subtarget.hasAVX()) { + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addImm(0x00) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + } else { + recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg) + .addReg(MaskReg) + .addReg(MaskReg) + .addImm(0x00) + .setMIFlags(MachineInstr::MIFlag::NoMerge)); + } + } + + if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) { + // Use dedicated blend instructions for SSE4.1+ + unsigned BlendOpc; + switch (Opcode) { + case X86::CTSELECT_V4F32: + BlendOpc = X86::BLENDVPSrr0; + break; + case X86::CTSELECT_V2F64: + BlendOpc = X86::BLENDVPDrr0; + break; + default: + // alias for pblendvb that takes xmm0 as implicit mask register + BlendOpc = X86::PBLENDVBrr0; + break; + } + + // Check if XMM0 is used as one of source registers, if yes then save it + // in Dst register and update FalseVal and TrueVal to Dst register + bool DidSaveXMM0 = false; + Register SavedXMM0 = X86::XMM0; + if (FalseVal == X86::XMM0 || TrueVal == X86::XMM0) { + Register SrcXMM0 = (FalseVal == X86::XMM0) ? 
FalseVal : TrueVal;
+
+      // If XMM0 currently holds one of the source values it will be
+      // clobbered when the blend mask is moved into XMM0 below, so preserve
+      // that value in Dst first.
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(SrcXMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // Retarget FalseVal and TrueVal from XMM0 to Dst.
+      if (FalseVal == X86::XMM0)
+        FalseVal = Dst;
+      if (TrueVal == X86::XMM0)
+        TrueVal = Dst;
+
+      // Remember that the original XMM0 value now lives in Dst.
+      SavedXMM0 = Dst;
+      DidSaveXMM0 = true;
+    } else if (MaskReg != X86::XMM0 && Dst != X86::XMM0) {
+
+      // XMM0 was not allocated to any operand, but we still need to save its
+      // value (into Dst) and restore it after using XMM0 as the mask register.
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+      SavedXMM0 = Dst;
+      DidSaveXMM0 = true;
+    }
+
+    if (MaskReg != X86::XMM0) {
+      // BLENDV uses XMM0 as the implicit mask register
+      // https://www.felixcloutier.com/x86/pblendvb
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+                      .addReg(MaskReg)
+                      .setMIFlag(MachineInstr::MIFlag::NoMerge));
+
+      // move FalseVal to mask (use MaskReg as the dst of the blend)
+      recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg)
+                      .addReg(FalseVal)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/) ; mask in xmm0
+      recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg)
+                      .addReg(MaskReg)
+                      .addReg(TrueVal)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // restore XMM0 from SavedXMM0 if we saved it into Dst
+      if (DidSaveXMM0) {
+        recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+                        .addReg(SavedXMM0)
+                        .setMIFlags(MachineInstr::MIFlag::NoMerge));
+      }
+      // dst = result (now in MaskReg)
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(MaskReg)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    } else {
+      // move FalseVal to Dst first, since MaskReg is XMM0 and Dst is not
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(FalseVal)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // Dst := blend(Dst /*false*/, TrueVal /*true*/) ; mask in xmm0
+      recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), Dst)
+                      .addReg(Dst)
+                      .addReg(TrueVal)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    }
+  } else {
+
+    // dst = mask
+    recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                    .addReg(MaskReg)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // mask &= true_val
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDrr), MaskReg)
+                    .addReg(MaskReg)
+                    .addReg(TrueVal)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // dst = ~mask & false_val
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDNrr), Dst)
+                    .addReg(Dst)
+                    .addReg(FalseVal)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // dst |= mask; (mask & t) | (~mask & f)
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::PORrr), Dst)
+                    .addReg(Dst)
+                    .addReg(MaskReg)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+  }
+
+  assert(FirstInstr && LastInstr &&
+         "Expected at least one expanded instruction");
+  auto BundleEnd = LastInstr->getIterator();
+  finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
+
+  MI.eraseFromParent();
+
+  return true;
+}
+
+bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // CTSELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
+  MachineOperand &OperandRes = MI.getOperand(0);  // destination register
+  MachineOperand &OperandTrue = MI.getOperand(1); // true value
+  MachineOperand &OperandCond = MI.getOperand(3); // condition code
+
+  assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
+         "Invalid operand types");
+  assert(OperandTrue.getReg() == OperandRes.getReg() &&
+         "Result register must match the true-value register");
+
+  assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
+
+  unsigned Opcode = 0;
+
+  switch (MI.getOpcode()) {
+  case X86::CTSELECT16rr:
+    Opcode = X86::CMOV16rr;
+    break;
+  case X86::CTSELECT32rr:
+    Opcode = X86::CMOV32rr;
+    break;
+  case X86::CTSELECT64rr:
+    Opcode = X86::CMOV64rr;
+    break;
+  case X86::CTSELECT16rm:
+    Opcode = X86::CMOV16rm;
+    break;
+  case X86::CTSELECT32rm:
+    Opcode = X86::CMOV32rm;
+    break;
+  case X86::CTSELECT64rm:
+    Opcode = X86::CMOV64rm;
+    break;
+  default:
+    llvm_unreachable("Invalid CTSELECT opcode");
+  }
+
+  // Build the CMOV by copying every operand of the pseudo; the operand order
+  // (dst, true value, false value / memory operands, condition code) is
+  // identical for CTSELECT and CMOV.
+  MachineInstrBuilder CmovBuilder = BuildMI(*MBB, MI, DL, get(Opcode));
+  for (unsigned i = 0u; i < MI.getNumOperands(); ++i) { // Copy
+    CmovBuilder.add(MI.getOperand(i));
+  }
+
+  // Remove the original CTSELECT instruction
+  MI.eraseFromParent();
+  return true;
+}
+
+/// Expand the i386-specific CTSELECT pseudo instructions (post-RA,
+/// constant-time). These internal pseudos receive a pre-materialized
+/// condition byte from the custom inserter, avoiding EFLAGS corruption
+/// issues during i64 type legalization.
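+///
+/// As a rough illustration only (the register choices below are hypothetical
+/// and depend entirely on allocation), a CTSELECT_I386_INT_GR32rr with the
+/// condition byte in %cl, src1 in %esi, src2 in %edi and dst in %eax is
+/// expected to expand to a bundle along the lines of:
+///
+///   movb   %cl,  %dl      # Step 1: copy the pre-materialized condition byte
+///   movzbl %dl,  %edx     # Step 2: zero-extend to 0 or 1
+///   negl   %edx           # Step 3: 0/1 -> all-zeros / all-ones mask
+///   movl   %esi, %eax     # Step 4: dst = src1
+///   andl   %edx, %eax     # Step 5: dst &= mask
+///   notl   %edx           # Step 6: mask = ~mask
+///   andl   %edi, %edx     # Step 7: mask &= src2
+///   orl    %edx, %eax     # Step 8: dst = (src1 & mask) | (src2 & ~mask)
+///
+/// The same straight-line sequence executes for either condition value, so
+/// the selected operand never influences the instruction trace.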
+bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
+  // (ins src1, src2, cond_byte)
+  // Note: cond_byte is pre-materialized by the custom inserter, so this
+  // expansion does not depend on EFLAGS.
+  Register DstReg = MI.getOperand(0).getReg();
+  Register TmpByteReg = MI.getOperand(1).getReg();
+  Register TmpMaskReg = MI.getOperand(2).getReg();
+  Register Src1Reg = MI.getOperand(3).getReg();
+  Register Src2Reg = MI.getOperand(4).getReg();
+  Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte
+
+  // Determine instruction opcodes based on register width
+  unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
+  if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) {
+    MovZXOp = 0; // No zero-extend needed for GR8
+    NegOp = X86::NEG8r;
+    MovOp = X86::MOV8rr;
+    AndOp = X86::AND8rr;
+    NotOp = X86::NOT8r;
+    OrOp = X86::OR8rr;
+  } else if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR16rr) {
+    MovZXOp = X86::MOVZX16rr8;
+    NegOp = X86::NEG16r;
+    MovOp = X86::MOV16rr;
+    AndOp = X86::AND16rr;
+    NotOp = X86::NOT16r;
+    OrOp = X86::OR16rr;
+  } else { // X86::CTSELECT_I386_INT_GR32rr
+    MovZXOp = X86::MOVZX32rr8;
+    NegOp = X86::NEG32r;
+    MovOp = X86::MOV32rr;
+    AndOp = X86::AND32rr;
+    NotOp = X86::NOT32r;
+    OrOp = X86::OR32rr;
+  }
+
+  // Constant-time selection bundle (8 instructions, 7 for GR8; no SETCC
+  // inside): result = (true_val & mask) | (false_val & ~mask)
+  // The condition byte is already materialized, avoiding any EFLAGS
+  // dependency.
+
+  // Step 1: Copy the pre-materialized condition byte to TmpByteReg
+  // This allows the bundle to work with allocated temporaries
+  auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
+                .addReg(CondByteReg)
+                .setMIFlag(MachineInstr::MIFlag::NoMerge);
+  auto BundleStart = I1->getIterator();
+
+  // Step 2: Zero-extend the condition byte to register width (0 or 1)
+  if (MI.getOpcode() != X86::CTSELECT_I386_INT_GR8rr) {
+    BuildMI(*MBB, MI, DL, get(MovZXOp), TmpMaskReg)
+        .addReg(TmpByteReg)
+        .setMIFlag(MachineInstr::MIFlag::NoMerge);
+  }
+
+  // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
+  Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ?
TmpByteReg : TmpMaskReg; + BuildMI(*MBB, MI, DL, get(NegOp), MaskReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 4,5: Apply mask to true value - copy src1 to dest, then AND with mask + BuildMI(*MBB, MI, DL, get(MovOp), DstReg) + .addReg(Src1Reg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + BuildMI(*MBB, MI, DL, get(AndOp), DstReg) + .addReg(DstReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 6: Create inverted mask inline (~mask) + BuildMI(*MBB, MI, DL, get(NotOp), MaskReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 7: Apply inverted mask to false value - reuse mask register directly + BuildMI(*MBB, MI, DL, get(AndOp), MaskReg) + .addReg(MaskReg) + .addReg(Src2Reg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Step 8: Final result: (src1 & mask) | (src2 & ~mask) + auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg) + .addReg(DstReg) + .addReg(MaskReg) + .setMIFlag(MachineInstr::MIFlag::NoMerge); + + // Bundle all generated instructions for atomic execution before removing MI + auto BundleEnd = std::next(LI->getIterator()); + if (BundleStart != BundleEnd) { + // Only bundle if we have multiple instructions + finalizeBundle(*MBB, BundleStart, BundleEnd); + } + + // TODO: Optimization opportunity - The register allocator may choose callee-saved + // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary + // save/restore overhead. Consider constraining these to caller-saved register + // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve + // constant-time performance by eliminating prologue/epilogue instructions. + + // Remove the original pseudo instruction + MI.eraseFromParent(); + return true; +} + static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) { switch (Opcode) { default: @@ -6411,6 +6961,43 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break; + + case X86::CTSELECT64rr: + case X86::CTSELECT32rr: + case X86::CTSELECT16rr: + case X86::CTSELECT64rm: + case X86::CTSELECT32rm: + case X86::CTSELECT16rm: + // These CTSELECT pseudos are only selected when CMOV is available + // Pattern matching ensures we use CTSELECT_I386 when CMOV is not available + return expandCtSelectWithCMOV(MI); + + // non-cmov CTSELECT expansion (post-RA, constant-time) + // These are the internal pseudos with pre-materialized condition byte + case X86::CTSELECT_I386_INT_GR8rr: + case X86::CTSELECT_I386_INT_GR16rr: + case X86::CTSELECT_I386_INT_GR32rr: + return expandCtSelectIntWithoutCMOV(MI); + + case X86::CTSELECT_V2F64: + case X86::CTSELECT_V4F32: + case X86::CTSELECT_V2I64: + case X86::CTSELECT_V4I32: + case X86::CTSELECT_V8I16: + case X86::CTSELECT_V16I8: + case X86::CTSELECT_V2F64X: + case X86::CTSELECT_V4F32X: + case X86::CTSELECT_V2I64X: + case X86::CTSELECT_V4I32X: + case X86::CTSELECT_V8I16X: + case X86::CTSELECT_V16I8X: + case X86::CTSELECT_V4I64: + case X86::CTSELECT_V8I32: + case X86::CTSELECT_V16I16: + case X86::CTSELECT_V32I8: + case X86::CTSELECT_V4F64: + case X86::CTSELECT_V8F32: + return expandCtSelectVector(MI); } return false; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f75559bd9598..ebd7e070d5fe8 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -724,6 +724,12 @@ class X86InstrInfo final : public X86GenInstrInfo { bool isFrameOperand(const MachineInstr &MI, 
unsigned int Op, int &FrameIndex) const;
+  /// Expand the CTSELECT pseudo-instructions.
+  bool expandCtSelectWithCMOV(MachineInstr &MI) const;
+  bool expandCtSelectIntWithoutCMOV(MachineInstr &MI) const;
+
+  bool expandCtSelectVector(MachineInstr &MI) const;
+
   /// Returns true iff the routine could find two commutable operands in the
   /// given machine instruction with 3 vector inputs.
   /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index c20bb05018b4d..23841034ed411 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -49,6 +49,11 @@ def HasZU : Predicate<"Subtarget->hasZU()">;
 def HasCF : Predicate<"Subtarget->hasCF()">;
 def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
+// Predicates for the native CMOV instruction (checks hasCMOV(), not canUseCMOV()).
+// HasCMOV may be true even without native CMOV (e.g. via SSE-based emulation), so
+// constant-time code that requires an actual CMOV must use HasNativeCMOV/NoNativeCMOV.
+def HasNativeCMOV : Predicate<"Subtarget->hasCMOV()">;
+def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">;
 def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
 def HasMMX : Predicate<"Subtarget->hasMMX()">;
 def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8dd6f3d97ccea..a776b54912c16 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -617,10 +617,10 @@ void X86PassConfig::addPreEmitPass2() {
         // ObjC runtime functions present in the module.
         const Function &F = MF.getFunction();
         const Module *M = F.getParent();
-        return M->getModuleFlag("kcfi") ||
+        return M->getModuleFlag("kcfi") || F.hasFnAttribute("ct-select") ||
                (TT.isOSDarwin() &&
                 (M->getFunction("objc_retainAutoreleasedReturnValue") ||
                  M->getFunction("objc_unsafeClaimAutoreleasedReturnValue")));
       }));
 
   // Analyzes and emits pseudos to support Win x64 Unwind V2.
This pass must run diff --git a/llvm/test/CodeGen/AArch64/ctselect.ll b/llvm/test/CodeGen/AArch64/ctselect.ll new file mode 100644 index 0000000000000..4cde9fe8a866a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ctselect.ll @@ -0,0 +1,125 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-eabi | FileCheck %s --check-prefixes=DEFAULT,NOFP16 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=DEFAULT,FP16 + +define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { +; DEFAULT-LABEL: ct_i1: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %1 +} + +define i8 @ct_i8(i1 %cond, i8 %a, i8 %b) { +; DEFAULT-LABEL: ct_i8: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %1 +} + +define i16 @ct_i16(i1 %cond, i16 %a, i16 %b) { +; DEFAULT-LABEL: ct_i16: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %1 +} + +define i32 @ct_i32(i1 %cond, i32 %a, i32 %b) { +; DEFAULT-LABEL: ct_i32: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %1 +} + +define i64 @ct_i64(i1 %cond, i64 %a, i64 %b) { +; DEFAULT-LABEL: ct_i64: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %1 +} + +define i128 @ct_i128(i1 %cond, i128 %a, i128 %b) { +; DEFAULT-LABEL: ct_i128: +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) + ret i128 %1 +} + +define half @ct_f16(i1 %cond, half %a, half %b) { +; DEFAULT-LABEL: ct_f16: +; NOFP16: fcvt +; NOFP16: csel +; FP16: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} +; NOFP16: fcvt + %1 = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b) + ret half %1 +} + +define float @ct_f32(i1 %cond, float %a, float %b) { +; DEFAULT-LABEL: ct_f32: +; DEFAULT: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %1 +} + +define double @ct_f64(i1 %cond, double %a, double %b) { +; DEFAULT-LABEL: ct_f64: +; DEFAULT: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{mov|ldr}} + %1 = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %1 +} + +define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; DEFAULT-LABEL: ct_v4i32: +; DEFAULT: csel +; DEFAULT: csel +; DEFAULT: csel +; DEFAULT: csel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{ldr}} + %1 = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %1 +} + +define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; DEFAULT-LABEL: ct_v4f32: +; DEFAULT: fcsel +; DEFAULT: fcsel +; DEFAULT: fcsel +; DEFAULT: fcsel +; DEFAULT-NOT: b{{eq|ne}} +; DEFAULT-NOT: j +; DEFAULT-NOT: {{ldr}} + %1 = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %1 +} \ No 
newline at end of file diff --git a/llvm/test/CodeGen/ARM/ctselect-half.ll b/llvm/test/CodeGen/ARM/ctselect-half.ll new file mode 100644 index 0000000000000..f75707fc91af3 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ctselect-half.ll @@ -0,0 +1,975 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s +; RUN: llc < %s -mtriple=armv8.6a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=BFLOAT-F16-NATIVE %s +; RUN: llc < %s -mtriple=armv8.2a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=F16-NATIVE %s +; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s +; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s + +define half @ct_half(i1 %cond, half %a, half %b) { +; CT-LABEL: ct_half: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; BFLOAT-F16-NATIVE-LABEL: ct_half: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: and r3, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: rsb r12, r3, #0 +; BFLOAT-F16-NATIVE-NEXT: and r0, r1, r12 +; BFLOAT-F16-NATIVE-NEXT: bic r12, r2, r12 +; BFLOAT-F16-NATIVE-NEXT: orr r0, r0, r12 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_half: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: and r3, r0, #1 +; F16-NATIVE-NEXT: rsb r12, r3, #0 +; F16-NATIVE-NEXT: and r0, r1, r12 +; F16-NATIVE-NEXT: bic r12, r2, r12 +; F16-NATIVE-NEXT: orr r0, r0, r12 +; F16-NATIVE-NEXT: bx lr +; +; THUMB1-LABEL: ct_half: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_half: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b) + ret half %sel +} + +define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) { +; CT-LABEL: ct_bf16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; BFLOAT-F16-NATIVE-LABEL: ct_bf16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: .pad #4 +; BFLOAT-F16-NATIVE-NEXT: sub sp, sp, #4 +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: rsb r12, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: and r3, r1, r12 +; BFLOAT-F16-NATIVE-NEXT: bic r12, r2, r12 +; BFLOAT-F16-NATIVE-NEXT: orr r3, r3, r12 +; BFLOAT-F16-NATIVE-NEXT: strh r3, [sp, #2] +; BFLOAT-F16-NATIVE-NEXT: ldrh r0, [sp, #2] +; BFLOAT-F16-NATIVE-NEXT: add sp, sp, #4 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_bf16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: and r3, r0, #1 +; F16-NATIVE-NEXT: rsb r12, r3, #0 +; F16-NATIVE-NEXT: and r0, r1, r12 +; F16-NATIVE-NEXT: bic r12, r2, r12 +; F16-NATIVE-NEXT: orr r0, r0, r12 +; 
F16-NATIVE-NEXT: bx lr +; +; THUMB1-LABEL: ct_bf16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_bf16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call bfloat @llvm.ct.select.bf16(i1 %cond, bfloat %a, bfloat %b) + ret bfloat %sel +} + +define <4 x half> @ct_v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) { +; CT-LABEL: ct_v4f16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, lr} +; CT-NEXT: push {r4, r5, r6, lr} +; CT-NEXT: ldrh r1, [sp, #20] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r4, [sp, #16] +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: ldrh lr, [sp, #28] +; CT-NEXT: orr r1, r4, r1, lsl #16 +; CT-NEXT: ldrh r6, [sp, #24] +; CT-NEXT: ldrh r5, [sp, #32] +; CT-NEXT: vmov d17, r2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: orr r6, r6, lr, lsl #16 +; CT-NEXT: orr r3, r5, r12, lsl #16 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vmov d16, r6, r3 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov.u16 r0, d18[0] +; CT-NEXT: vmov.u16 r1, d18[1] +; CT-NEXT: vmov.u16 r2, d18[2] +; CT-NEXT: vmov.u16 r3, d18[3] +; CT-NEXT: pop {r4, r5, r6, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v4f16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: .save {r4, r5, r6, lr} +; BFLOAT-F16-NATIVE-NEXT: push {r4, r5, r6, lr} +; BFLOAT-F16-NATIVE-NEXT: ldrh r1, [sp, #20] +; BFLOAT-F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r4, [sp, #16] +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; BFLOAT-F16-NATIVE-NEXT: ldrh lr, [sp, #28] +; BFLOAT-F16-NATIVE-NEXT: orr r1, r4, r1, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r6, [sp, #24] +; BFLOAT-F16-NATIVE-NEXT: ldrh r5, [sp, #32] +; BFLOAT-F16-NATIVE-NEXT: vmov d17, r2, r1 +; BFLOAT-F16-NATIVE-NEXT: rsb r1, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: orr r6, r6, lr, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: orr r3, r5, r12, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 d19, r1 +; BFLOAT-F16-NATIVE-NEXT: vmov d16, r6, r3 +; BFLOAT-F16-NATIVE-NEXT: vand d18, d17, d19 +; BFLOAT-F16-NATIVE-NEXT: vbic d19, d16, d19 +; BFLOAT-F16-NATIVE-NEXT: vorr d18, d18, d19 +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r0, d18[0] +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r1, d18[1] +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r2, d18[2] +; BFLOAT-F16-NATIVE-NEXT: vmov.u16 r3, d18[3] +; BFLOAT-F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +; +; F16-NATIVE-LABEL: ct_v4f16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, lr} +; F16-NATIVE-NEXT: ldrh r1, [sp, #20] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #16] +; F16-NATIVE-NEXT: and r0, r0, #1 +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: ldrh lr, [sp, #28] +; F16-NATIVE-NEXT: orr r1, r4, r1, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #24] +; F16-NATIVE-NEXT: ldrh r5, [sp, #32] +; F16-NATIVE-NEXT: vmov d17, r2, r1 +; F16-NATIVE-NEXT: rsb r1, r0, #0 +; 
F16-NATIVE-NEXT: orr r6, r6, lr, lsl #16 +; F16-NATIVE-NEXT: orr r3, r5, r12, lsl #16 +; F16-NATIVE-NEXT: vdup.32 d19, r1 +; F16-NATIVE-NEXT: vmov d16, r6, r3 +; F16-NATIVE-NEXT: vand d18, d17, d19 +; F16-NATIVE-NEXT: vbic d19, d16, d19 +; F16-NATIVE-NEXT: vorr d18, d18, d19 +; F16-NATIVE-NEXT: vmov.u16 r0, d18[0] +; F16-NATIVE-NEXT: vmov.u16 r1, d18[1] +; F16-NATIVE-NEXT: vmov.u16 r2, d18[2] +; F16-NATIVE-NEXT: vmov.u16 r3, d18[3] +; F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +; +; THUMB1-LABEL: ct_v4f16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4f16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrh.w r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldrh.w r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #16] +; THUMB2-NEXT: ldrh.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldrh.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldrh.w r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x half> @llvm.ct.select.v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) + ret <4 x half> %sel +} + +define <4 x bfloat> @ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) { +; CT-LABEL: ct_v4bf16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, lr} +; CT-NEXT: push {r4, r5, r6, lr} +; CT-NEXT: ldrh r1, [sp, #20] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r4, [sp, #16] +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: ldrh lr, [sp, #28] +; CT-NEXT: orr r1, r4, r1, lsl #16 +; CT-NEXT: ldrh r6, [sp, #24] +; CT-NEXT: ldrh r5, [sp, #32] +; CT-NEXT: vmov d17, r2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: orr r6, r6, lr, lsl #16 +; CT-NEXT: orr r3, r5, r12, lsl #16 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vmov d16, r6, 
r3 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov.u16 r0, d18[0] +; CT-NEXT: vmov.u16 r1, d18[1] +; CT-NEXT: vmov.u16 r2, d18[2] +; CT-NEXT: vmov.u16 r3, d18[3] +; CT-NEXT: pop {r4, r5, r6, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v4bf16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: vldr d16, [sp] +; BFLOAT-F16-NATIVE-NEXT: rsb r1, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: vmov d17, r2, r3 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 d19, r1 +; BFLOAT-F16-NATIVE-NEXT: vand d18, d17, d19 +; BFLOAT-F16-NATIVE-NEXT: vbic d19, d16, d19 +; BFLOAT-F16-NATIVE-NEXT: vorr d18, d18, d19 +; BFLOAT-F16-NATIVE-NEXT: vmov r0, r1, d18 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_v4bf16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, lr} +; F16-NATIVE-NEXT: ldrh r1, [sp, #20] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #16] +; F16-NATIVE-NEXT: and r0, r0, #1 +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: ldrh lr, [sp, #28] +; F16-NATIVE-NEXT: orr r1, r4, r1, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #24] +; F16-NATIVE-NEXT: ldrh r5, [sp, #32] +; F16-NATIVE-NEXT: vmov d17, r2, r1 +; F16-NATIVE-NEXT: rsb r1, r0, #0 +; F16-NATIVE-NEXT: orr r6, r6, lr, lsl #16 +; F16-NATIVE-NEXT: orr r3, r5, r12, lsl #16 +; F16-NATIVE-NEXT: vdup.32 d19, r1 +; F16-NATIVE-NEXT: vmov d16, r6, r3 +; F16-NATIVE-NEXT: vand d18, d17, d19 +; F16-NATIVE-NEXT: vbic d19, d16, d19 +; F16-NATIVE-NEXT: vorr d18, d18, d19 +; F16-NATIVE-NEXT: vmov.u16 r0, d18[0] +; F16-NATIVE-NEXT: vmov.u16 r1, d18[1] +; F16-NATIVE-NEXT: vmov.u16 r2, d18[2] +; F16-NATIVE-NEXT: vmov.u16 r3, d18[3] +; F16-NATIVE-NEXT: pop {r4, r5, r6, pc} +; +; THUMB1-LABEL: ct_v4bf16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4bf16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrh.w r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldrh.w r2, [sp, #28] +; 
THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #16] +; THUMB2-NEXT: ldrh.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldrh.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldrh.w r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x bfloat> @llvm.ct.select.v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) + ret <4 x bfloat> %sel +} + +define <8 x half> @ct_v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) { +; CT-LABEL: ct_v8f16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CT-NEXT: push {r4, r5, r6, r7, r8, lr} +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #32] +; CT-NEXT: and r1, r1, #1 +; CT-NEXT: ldrh r3, [sp, #52] +; CT-NEXT: vmov.32 d16[0], r2 +; CT-NEXT: ldrh r2, [sp, #48] +; CT-NEXT: orr r7, r7, r12, lsl #16 +; CT-NEXT: ldrh r5, [sp, #68] +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: vmov.32 d17[0], r7 +; CT-NEXT: ldrh r7, [sp, #64] +; CT-NEXT: ldrh r3, [sp, #28] +; CT-NEXT: vmov.32 d18[0], r2 +; CT-NEXT: ldrh r2, [sp, #24] +; CT-NEXT: orr r7, r7, r5, lsl #16 +; CT-NEXT: ldrh r5, [sp, #76] +; CT-NEXT: vmov.32 d19[0], r7 +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #72] +; CT-NEXT: ldrh lr, [sp, #60] +; CT-NEXT: vmov.32 d16[1], r2 +; CT-NEXT: orr r2, r7, r5, lsl #16 +; CT-NEXT: ldrh r4, [sp, #56] +; CT-NEXT: ldrh r8, [sp, #44] +; CT-NEXT: vmov.32 d19[1], r2 +; CT-NEXT: orr r2, r4, lr, lsl #16 +; CT-NEXT: ldrh r6, [sp, #40] +; CT-NEXT: vmov.32 d18[1], r2 +; CT-NEXT: orr r2, r6, r8, lsl #16 +; CT-NEXT: vmov.32 d17[1], r2 +; CT-NEXT: rsb r2, r1, #0 +; CT-NEXT: vdup.32 q11, r2 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vst1.64 {d20, d21}, [r0:128] +; CT-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v8f16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; BFLOAT-F16-NATIVE-NEXT: push {r4, r5, r6, r7, r8, lr} +; BFLOAT-F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; BFLOAT-F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r7, [sp, #32] +; BFLOAT-F16-NATIVE-NEXT: and r1, r1, #1 +; BFLOAT-F16-NATIVE-NEXT: ldrh r3, [sp, #52] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d16[0], r2 +; BFLOAT-F16-NATIVE-NEXT: ldrh r2, [sp, #48] +; BFLOAT-F16-NATIVE-NEXT: orr r7, r7, r12, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r5, [sp, #68] +; BFLOAT-F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d17[0], r7 +; BFLOAT-F16-NATIVE-NEXT: ldrh r7, [sp, #64] +; BFLOAT-F16-NATIVE-NEXT: ldrh r3, [sp, #28] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d18[0], r2 +; BFLOAT-F16-NATIVE-NEXT: ldrh r2, [sp, #24] +; BFLOAT-F16-NATIVE-NEXT: orr r7, r7, r5, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r5, [sp, #76] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d19[0], r7 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r7, [sp, #72] +; BFLOAT-F16-NATIVE-NEXT: ldrh lr, [sp, #60] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d16[1], r2 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r7, r5, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r4, [sp, #56] +; BFLOAT-F16-NATIVE-NEXT: ldrh r8, [sp, #44] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d19[1], 
r2 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r4, lr, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: ldrh r6, [sp, #40] +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d18[1], r2 +; BFLOAT-F16-NATIVE-NEXT: orr r2, r6, r8, lsl #16 +; BFLOAT-F16-NATIVE-NEXT: vmov.32 d17[1], r2 +; BFLOAT-F16-NATIVE-NEXT: rsb r2, r1, #0 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 q11, r2 +; BFLOAT-F16-NATIVE-NEXT: vand q10, q8, q11 +; BFLOAT-F16-NATIVE-NEXT: vbic q11, q9, q11 +; BFLOAT-F16-NATIVE-NEXT: vorr q10, q10, q11 +; BFLOAT-F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] +; BFLOAT-F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; F16-NATIVE-LABEL: ct_v8f16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #32] +; F16-NATIVE-NEXT: and r1, r1, #1 +; F16-NATIVE-NEXT: ldrh r3, [sp, #52] +; F16-NATIVE-NEXT: vmov.32 d16[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #48] +; F16-NATIVE-NEXT: orr r7, r7, r12, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #68] +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[0], r7 +; F16-NATIVE-NEXT: ldrh r7, [sp, #64] +; F16-NATIVE-NEXT: ldrh r3, [sp, #28] +; F16-NATIVE-NEXT: vmov.32 d18[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #24] +; F16-NATIVE-NEXT: orr r7, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #76] +; F16-NATIVE-NEXT: vmov.32 d19[0], r7 +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #72] +; F16-NATIVE-NEXT: ldrh lr, [sp, #60] +; F16-NATIVE-NEXT: vmov.32 d16[1], r2 +; F16-NATIVE-NEXT: orr r2, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #56] +; F16-NATIVE-NEXT: ldrh r8, [sp, #44] +; F16-NATIVE-NEXT: vmov.32 d19[1], r2 +; F16-NATIVE-NEXT: orr r2, r4, lr, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #40] +; F16-NATIVE-NEXT: vmov.32 d18[1], r2 +; F16-NATIVE-NEXT: orr r2, r6, r8, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[1], r2 +; F16-NATIVE-NEXT: rsb r2, r1, #0 +; F16-NATIVE-NEXT: vdup.32 q11, r2 +; F16-NATIVE-NEXT: vand q10, q8, q11 +; F16-NATIVE-NEXT: vbic q11, q9, q11 +; F16-NATIVE-NEXT: vorr q10, q10, q11 +; F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] +; F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; THUMB1-LABEL: ct_v8f16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; 
THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strh r5, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strh r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8f16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrh.w r12, [sp, #68] +; THUMB2-NEXT: ldrh.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrh.w r12, [sp, #64] +; THUMB2-NEXT: ldrh.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r1, [sp, #44] +; THUMB2-NEXT: strh r4, [r0, #4] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrh.w r1, [sp, #40] +; THUMB2-NEXT: strh r5, [r0, #2] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strh r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <8 x half> @llvm.ct.select.v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) + ret <8 x 
half> %sel +} + +define <8 x bfloat> @ct_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) { +; CT-LABEL: ct_v8bf16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CT-NEXT: push {r4, r5, r6, r7, r8, lr} +; CT-NEXT: ldrh r12, [sp, #36] +; CT-NEXT: pkhbt r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #32] +; CT-NEXT: and r1, r1, #1 +; CT-NEXT: ldrh r3, [sp, #52] +; CT-NEXT: vmov.32 d16[0], r2 +; CT-NEXT: ldrh r2, [sp, #48] +; CT-NEXT: orr r7, r7, r12, lsl #16 +; CT-NEXT: ldrh r5, [sp, #68] +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: vmov.32 d17[0], r7 +; CT-NEXT: ldrh r7, [sp, #64] +; CT-NEXT: ldrh r3, [sp, #28] +; CT-NEXT: vmov.32 d18[0], r2 +; CT-NEXT: ldrh r2, [sp, #24] +; CT-NEXT: orr r7, r7, r5, lsl #16 +; CT-NEXT: ldrh r5, [sp, #76] +; CT-NEXT: vmov.32 d19[0], r7 +; CT-NEXT: orr r2, r2, r3, lsl #16 +; CT-NEXT: ldrh r7, [sp, #72] +; CT-NEXT: ldrh lr, [sp, #60] +; CT-NEXT: vmov.32 d16[1], r2 +; CT-NEXT: orr r2, r7, r5, lsl #16 +; CT-NEXT: ldrh r4, [sp, #56] +; CT-NEXT: ldrh r8, [sp, #44] +; CT-NEXT: vmov.32 d19[1], r2 +; CT-NEXT: orr r2, r4, lr, lsl #16 +; CT-NEXT: ldrh r6, [sp, #40] +; CT-NEXT: vmov.32 d18[1], r2 +; CT-NEXT: orr r2, r6, r8, lsl #16 +; CT-NEXT: vmov.32 d17[1], r2 +; CT-NEXT: rsb r2, r1, #0 +; CT-NEXT: vdup.32 q11, r2 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vst1.64 {d20, d21}, [r0:128] +; CT-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; BFLOAT-F16-NATIVE-LABEL: ct_v8bf16: +; BFLOAT-F16-NATIVE: @ %bb.0: @ %entry +; BFLOAT-F16-NATIVE-NEXT: add r1, sp, #8 +; BFLOAT-F16-NATIVE-NEXT: and r0, r0, #1 +; BFLOAT-F16-NATIVE-NEXT: vld1.64 {d18, d19}, [r1] +; BFLOAT-F16-NATIVE-NEXT: rsb r1, r0, #0 +; BFLOAT-F16-NATIVE-NEXT: vldr d17, [sp] +; BFLOAT-F16-NATIVE-NEXT: vmov d16, r2, r3 +; BFLOAT-F16-NATIVE-NEXT: vdup.32 q11, r1 +; BFLOAT-F16-NATIVE-NEXT: vand q10, q8, q11 +; BFLOAT-F16-NATIVE-NEXT: vbic q11, q9, q11 +; BFLOAT-F16-NATIVE-NEXT: vorr q10, q10, q11 +; BFLOAT-F16-NATIVE-NEXT: vmov r0, r1, d20 +; BFLOAT-F16-NATIVE-NEXT: vmov r2, r3, d21 +; BFLOAT-F16-NATIVE-NEXT: bx lr +; +; F16-NATIVE-LABEL: ct_v8bf16: +; F16-NATIVE: @ %bb.0: @ %entry +; F16-NATIVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: push {r4, r5, r6, r7, r8, lr} +; F16-NATIVE-NEXT: ldrh r12, [sp, #36] +; F16-NATIVE-NEXT: pkhbt r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #32] +; F16-NATIVE-NEXT: and r1, r1, #1 +; F16-NATIVE-NEXT: ldrh r3, [sp, #52] +; F16-NATIVE-NEXT: vmov.32 d16[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #48] +; F16-NATIVE-NEXT: orr r7, r7, r12, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #68] +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[0], r7 +; F16-NATIVE-NEXT: ldrh r7, [sp, #64] +; F16-NATIVE-NEXT: ldrh r3, [sp, #28] +; F16-NATIVE-NEXT: vmov.32 d18[0], r2 +; F16-NATIVE-NEXT: ldrh r2, [sp, #24] +; F16-NATIVE-NEXT: orr r7, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r5, [sp, #76] +; F16-NATIVE-NEXT: vmov.32 d19[0], r7 +; F16-NATIVE-NEXT: orr r2, r2, r3, lsl #16 +; F16-NATIVE-NEXT: ldrh r7, [sp, #72] +; F16-NATIVE-NEXT: ldrh lr, [sp, #60] +; F16-NATIVE-NEXT: vmov.32 d16[1], r2 +; F16-NATIVE-NEXT: orr r2, r7, r5, lsl #16 +; F16-NATIVE-NEXT: ldrh r4, [sp, #56] +; F16-NATIVE-NEXT: ldrh r8, [sp, #44] +; F16-NATIVE-NEXT: vmov.32 d19[1], r2 +; F16-NATIVE-NEXT: orr r2, r4, lr, lsl #16 +; F16-NATIVE-NEXT: ldrh r6, [sp, #40] +; F16-NATIVE-NEXT: vmov.32 d18[1], r2 +; F16-NATIVE-NEXT: orr r2, r6, r8, lsl #16 +; F16-NATIVE-NEXT: vmov.32 d17[1], r2 +; F16-NATIVE-NEXT: rsb r2, r1, 
#0 +; F16-NATIVE-NEXT: vdup.32 q11, r2 +; F16-NATIVE-NEXT: vand q10, q8, q11 +; F16-NATIVE-NEXT: vbic q11, q9, q11 +; F16-NATIVE-NEXT: vorr q10, q10, q11 +; F16-NATIVE-NEXT: vst1.64 {d20, d21}, [r0:128] +; F16-NATIVE-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; THUMB1-LABEL: ct_v8bf16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strh r5, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strh r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8bf16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrh.w r12, [sp, #68] +; THUMB2-NEXT: ldrh.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrh.w r12, [sp, #64] +; THUMB2-NEXT: ldrh.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w 
r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r1, [sp, #44] +; THUMB2-NEXT: strh r4, [r0, #4] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrh.w r1, [sp, #40] +; THUMB2-NEXT: strh r5, [r0, #2] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strh r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <8 x bfloat> @llvm.ct.select.v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) + ret <8 x bfloat> %sel +} diff --git a/llvm/test/CodeGen/ARM/ctselect-vector.ll b/llvm/test/CodeGen/ARM/ctselect-vector.ll new file mode 100644 index 0000000000000..c410f78b24c0e --- /dev/null +++ b/llvm/test/CodeGen/ARM/ctselect-vector.ll @@ -0,0 +1,2179 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s +; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s +; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s +; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s + +define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) { +; CT-LABEL: ct_v8i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v8i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and lr, r1, #1 +; DEFAULT-NEXT: ldrb r12, [sp, #68] +; DEFAULT-NEXT: ldrb r1, [sp, #36] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r4, r1, r5 +; DEFAULT-NEXT: bic r5, r12, r5 +; DEFAULT-NEXT: orr r4, r4, r5 +; DEFAULT-NEXT: ldrb r12, [sp, #64] +; DEFAULT-NEXT: ldrb r5, [sp, #32] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #7] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #60] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #28] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #6] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #56] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #24] +; 
DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #5] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #52] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #20] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #4] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #48] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #16] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #3] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r1, [sp, #44] +; DEFAULT-NEXT: strb r4, [r0, #2] +; DEFAULT-NEXT: rsb r4, lr, #0 +; DEFAULT-NEXT: and r5, r3, r4 +; DEFAULT-NEXT: bic r4, r1, r4 +; DEFAULT-NEXT: orr r5, r5, r4 +; DEFAULT-NEXT: ldrb r1, [sp, #40] +; DEFAULT-NEXT: strb r5, [r0, #1] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r3, r2, r5 +; DEFAULT-NEXT: bic r5, r1, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: strb r3, [r0] +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v8i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #7] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #5] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #3] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strb r5, [r0, #1] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 
+; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strb r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrb.w r12, [sp, #68] +; THUMB2-NEXT: ldrb.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrb.w r12, [sp, #64] +; THUMB2-NEXT: ldrb.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #7] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #5] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #4] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #3] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r1, [sp, #44] +; THUMB2-NEXT: strb r4, [r0, #2] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrb.w r1, [sp, #40] +; THUMB2-NEXT: strb r5, [r0, #1] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strb r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <8 x i8> @llvm.ct.select.v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) + ret <8 x i8> %sel +} + +define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) { +; CT-LABEL: ct_v4i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldrh r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldrh r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldrh r3, [sp, #16] +; DEFAULT-NEXT: ldrh lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldrh lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldrh r4, [sp, #20] +; DEFAULT-NEXT: 
and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrh.w r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldrh.w r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #16] +; THUMB2-NEXT: ldrh.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldrh.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldrh.w r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x i16> @llvm.ct.select.v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) + ret <4 x i16> %sel +} + +define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) { +; CT-LABEL: ct_v2i32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2i32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: 
movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2i32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <2 x i32> @llvm.ct.select.v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %sel +} + +define <1 x i64> @ct_v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) { +; CT-LABEL: ct_v1i64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v1i64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v1i64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) + ret <1 x i64> %sel +} + +define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) { +; CT-LABEL: ct_v2f32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; 
CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2f32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2f32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2f32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <2 x float> @llvm.ct.select.v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) + ret <2 x float> %sel +} + +define <16 x i8> @ct_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { +; CT-LABEL: ct_v16i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v16i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and lr, r1, #1 +; DEFAULT-NEXT: ldrb r12, [sp, #132] +; DEFAULT-NEXT: ldrb r1, [sp, #68] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r4, r1, r5 +; DEFAULT-NEXT: bic r5, r12, r5 +; DEFAULT-NEXT: orr r4, r4, r5 +; DEFAULT-NEXT: ldrb r12, [sp, #128] +; DEFAULT-NEXT: ldrb r5, [sp, #64] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #15] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #124] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #60] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #14] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #120] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #56] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #13] +; DEFAULT-NEXT: and r4, r5, r1 +; 
DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #116] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #52] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #12] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #112] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #48] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #11] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #108] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #44] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #10] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #104] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #40] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #9] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #100] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #36] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #8] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #96] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #32] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #7] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #92] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #28] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #6] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #88] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #24] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #5] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #84] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #20] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #4] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrb r12, [sp, #80] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r5, [sp, #16] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strb r4, [r0, #3] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrb r1, [sp, #76] +; DEFAULT-NEXT: strb r4, [r0, #2] +; DEFAULT-NEXT: rsb r4, lr, #0 +; DEFAULT-NEXT: and r5, r3, r4 +; DEFAULT-NEXT: bic r4, r1, r4 +; DEFAULT-NEXT: orr r5, r5, r4 +; DEFAULT-NEXT: ldrb r1, [sp, #72] +; DEFAULT-NEXT: strb r5, [r0, #1] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r3, r2, r5 +; DEFAULT-NEXT: bic r5, r1, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: strb r3, [r0] +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v16i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #140] +; THUMB1-NEXT: ldr r5, [sp, #76] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #15] +; THUMB1-NEXT: ldr r1, [sp, #136] +; THUMB1-NEXT: ldr r5, [sp, #72] +; THUMB1-NEXT: 
mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #132] +; THUMB1-NEXT: ldr r5, [sp, #68] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #13] +; THUMB1-NEXT: ldr r1, [sp, #128] +; THUMB1-NEXT: ldr r5, [sp, #64] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #124] +; THUMB1-NEXT: ldr r5, [sp, #60] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #11] +; THUMB1-NEXT: ldr r1, [sp, #120] +; THUMB1-NEXT: ldr r5, [sp, #56] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #116] +; THUMB1-NEXT: ldr r5, [sp, #52] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #9] +; THUMB1-NEXT: ldr r1, [sp, #112] +; THUMB1-NEXT: ldr r5, [sp, #48] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #108] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #7] +; THUMB1-NEXT: ldr r1, [sp, #104] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #100] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #5] +; THUMB1-NEXT: ldr r1, [sp, #96] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #92] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; 
THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #3] +; THUMB1-NEXT: ldr r1, [sp, #88] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strb r6, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #84] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strb r5, [r0, #1] +; THUMB1-NEXT: ldr r1, [sp, #80] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strb r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v16i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrb.w r12, [sp, #132] +; THUMB2-NEXT: ldrb.w r1, [sp, #68] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrb.w r12, [sp, #128] +; THUMB2-NEXT: ldrb.w r5, [sp, #64] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #15] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #124] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #60] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #120] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #56] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #13] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #116] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #52] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #112] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #48] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #11] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #108] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #44] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #104] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #40] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #9] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #100] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #36] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #96] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #32] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #7] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #92] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w 
r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #88] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #5] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #84] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #4] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrb.w r12, [sp, #80] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strb r4, [r0, #3] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrb.w r1, [sp, #76] +; THUMB2-NEXT: strb r4, [r0, #2] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrb.w r1, [sp, #72] +; THUMB2-NEXT: strb r5, [r0, #1] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strb r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %sel +} + +define <8 x i16> @ct_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { +; CT-LABEL: ct_v8i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v8i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and lr, r1, #1 +; DEFAULT-NEXT: ldrh r12, [sp, #68] +; DEFAULT-NEXT: ldrh r1, [sp, #36] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r4, r1, r5 +; DEFAULT-NEXT: bic r5, r12, r5 +; DEFAULT-NEXT: orr r4, r4, r5 +; DEFAULT-NEXT: ldrh r12, [sp, #64] +; DEFAULT-NEXT: ldrh r5, [sp, #32] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #14] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #60] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #28] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #12] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #56] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #24] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #10] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #52] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #20] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #8] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: ldrh r12, [sp, #48] +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r5, [sp, #16] +; DEFAULT-NEXT: rsb r1, lr, #0 +; DEFAULT-NEXT: strh r4, [r0, #6] +; DEFAULT-NEXT: and r4, r5, r1 +; DEFAULT-NEXT: bic r1, r12, r1 +; DEFAULT-NEXT: orr r4, r4, r1 +; DEFAULT-NEXT: ldrh r1, [sp, #44] +; DEFAULT-NEXT: strh r4, [r0, #4] +; 
DEFAULT-NEXT: rsb r4, lr, #0 +; DEFAULT-NEXT: and r5, r3, r4 +; DEFAULT-NEXT: bic r4, r1, r4 +; DEFAULT-NEXT: orr r5, r5, r4 +; DEFAULT-NEXT: ldrh r1, [sp, #40] +; DEFAULT-NEXT: strh r5, [r0, #2] +; DEFAULT-NEXT: rsb r5, lr, #0 +; DEFAULT-NEXT: and r3, r2, r5 +; DEFAULT-NEXT: bic r5, r1, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: strh r3, [r0] +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v8i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r1 +; THUMB1-NEXT: ldr r1, [sp, #76] +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #14] +; THUMB1-NEXT: ldr r1, [sp, #72] +; THUMB1-NEXT: ldr r5, [sp, #40] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #12] +; THUMB1-NEXT: ldr r1, [sp, #68] +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #10] +; THUMB1-NEXT: ldr r1, [sp, #64] +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #8] +; THUMB1-NEXT: ldr r1, [sp, #60] +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #6] +; THUMB1-NEXT: ldr r1, [sp, #56] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r6, r5 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: ands r6, r7 +; THUMB1-NEXT: eors r6, r1 +; THUMB1-NEXT: strh r6, [r0, #4] +; THUMB1-NEXT: ldr r1, [sp, #52] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r5, r3 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: ands r5, r6 +; THUMB1-NEXT: eors r5, r1 +; THUMB1-NEXT: strh r5, [r0, #2] +; THUMB1-NEXT: ldr r1, [sp, #48] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r3, r2 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: ands r3, r5 +; THUMB1-NEXT: eors r3, r1 +; THUMB1-NEXT: strh r3, [r0] +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v8i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and lr, r1, #1 +; THUMB2-NEXT: ldrh.w r12, [sp, #68] +; THUMB2-NEXT: ldrh.w r1, [sp, #36] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r4, r1, r5 +; THUMB2-NEXT: bic.w r5, r12, r5 +; THUMB2-NEXT: orrs r4, r5 +; THUMB2-NEXT: ldrh.w r12, [sp, #64] +; THUMB2-NEXT: ldrh.w r5, [sp, #32] +; THUMB2-NEXT: 
rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #14] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #60] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #28] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #12] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #56] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #24] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #10] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #52] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #20] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #8] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: ldrh.w r12, [sp, #48] +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r5, [sp, #16] +; THUMB2-NEXT: rsb.w r1, lr, #0 +; THUMB2-NEXT: strh r4, [r0, #6] +; THUMB2-NEXT: and.w r4, r5, r1 +; THUMB2-NEXT: bic.w r1, r12, r1 +; THUMB2-NEXT: orrs r4, r1 +; THUMB2-NEXT: ldrh.w r1, [sp, #44] +; THUMB2-NEXT: strh r4, [r0, #4] +; THUMB2-NEXT: rsb.w r4, lr, #0 +; THUMB2-NEXT: and.w r5, r3, r4 +; THUMB2-NEXT: bic.w r4, r1, r4 +; THUMB2-NEXT: orrs r5, r4 +; THUMB2-NEXT: ldrh.w r1, [sp, #40] +; THUMB2-NEXT: strh r5, [r0, #2] +; THUMB2-NEXT: rsb.w r5, lr, #0 +; THUMB2-NEXT: and.w r3, r2, r5 +; THUMB2-NEXT: bic.w r5, r1, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: strh r3, [r0] +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %sel +} + +define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; CT-LABEL: ct_v4i32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4i32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4i32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; 
THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4i32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %sel +} + +define <2 x i64> @ct_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; CT-LABEL: ct_v2i64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v2i64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr 
r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v2i64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %sel +} + +define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; CT-LABEL: ct_v4f32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4f32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4f32: +; THUMB1: @ 
%bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4f32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %sel +} + +define <2 x double> @ct_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; CT-LABEL: ct_v2f64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: add r1, sp, #8 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vld1.64 {d18, d19}, [r1] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vldr d17, [sp] +; CT-NEXT: vmov d16, r2, r3 +; CT-NEXT: vdup.32 q11, r1 +; CT-NEXT: vand q10, q8, q11 +; CT-NEXT: vbic q11, q9, q11 +; CT-NEXT: vorr q10, q10, q11 +; CT-NEXT: vmov r0, r1, d20 +; CT-NEXT: vmov r2, r3, d21 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2f64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #28] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: ldr r3, [sp, #16] +; DEFAULT-NEXT: ldr lr, [sp, #32] +; DEFAULT-NEXT: and r2, r3, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: ldr lr, [sp, #36] +; 
DEFAULT-NEXT: orr r2, r2, r4 +; DEFAULT-NEXT: ldr r4, [sp, #20] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v2f64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #32] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #36] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ldr r3, [sp, #40] +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r5 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r3 +; THUMB1-NEXT: ldr r5, [sp, #44] +; THUMB1-NEXT: ldr r6, [sp, #28] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v2f64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #28] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: ldr r3, [sp, #16] +; THUMB2-NEXT: ldr.w lr, [sp, #32] +; THUMB2-NEXT: and.w r2, r3, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: ldr.w lr, [sp, #36] +; THUMB2-NEXT: orrs r2, r4 +; THUMB2-NEXT: ldr r4, [sp, #20] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %sel +} + +; +; itty bitty vector type edge cases follow. these should be scalarised. 
+; +define <1 x i8> @ct_v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) { +; CT-LABEL: ct_v1i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call <1 x i8> @llvm.ct.select.i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) + ret <1 x i8> %sel +} + +define <2 x i8> @ct_v2i8(i1 %cond, <2 x i8> %a, <2 x i8> %b) { +; CT-LABEL: ct_v2i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r1, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: ldrb r3, [sp, #8] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r2, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r3 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r3 +; THUMB1-NEXT: ldr r3, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: eors r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r3 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r1, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: ldrb.w r3, [sp, #8] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r2, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <2 x i8> @llvm.ct.select.i16(i1 %cond, <2 x i8> %a, <2 x i8> %b) + ret <2 x i8> %sel +} + +define <4 x i8> @ct_v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b) { +; CT-LABEL: ct_v4i8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] 
+; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v4i8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r4, r5, r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldrb lr, [sp, #20] +; DEFAULT-NEXT: rsb r4, r12, #0 +; DEFAULT-NEXT: rsb r5, r12, #0 +; DEFAULT-NEXT: and r0, r1, r4 +; DEFAULT-NEXT: bic r4, lr, r4 +; DEFAULT-NEXT: orr r0, r0, r4 +; DEFAULT-NEXT: ldrb r4, [sp, #24] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r2, lr +; DEFAULT-NEXT: bic lr, r4, lr +; DEFAULT-NEXT: ldrb r4, [sp, #28] +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r2, r3, lr +; DEFAULT-NEXT: bic lr, r4, lr +; DEFAULT-NEXT: orr r2, r2, lr +; DEFAULT-NEXT: ldrb r4, [sp, #16] +; DEFAULT-NEXT: ldrb lr, [sp, #32] +; DEFAULT-NEXT: and r3, r4, r5 +; DEFAULT-NEXT: bic r5, lr, r5 +; DEFAULT-NEXT: orr r3, r3, r5 +; DEFAULT-NEXT: pop {r4, r5, r11, pc} +; +; THUMB1-LABEL: ct_v4i8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r6, r7, lr} +; THUMB1-NEXT: push {r4, r5, r6, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r5, [sp, #24] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r5 +; THUMB1-NEXT: ands r0, r6 +; THUMB1-NEXT: eors r0, r5 +; THUMB1-NEXT: ldr r5, [sp, #28] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: eors r1, r5 +; THUMB1-NEXT: ands r1, r6 +; THUMB1-NEXT: eors r1, r5 +; THUMB1-NEXT: ldr r5, [sp, #32] +; THUMB1-NEXT: mov r6, r4 +; THUMB1-NEXT: lsls r6, r6, #31 +; THUMB1-NEXT: asrs r6, r6, #31 +; THUMB1-NEXT: mov r2, r3 +; THUMB1-NEXT: eors r2, r5 +; THUMB1-NEXT: ands r2, r6 +; THUMB1-NEXT: eors r2, r5 +; THUMB1-NEXT: ldr r5, [sp, #36] +; THUMB1-NEXT: ldr r6, [sp, #20] +; THUMB1-NEXT: mov r7, r4 +; THUMB1-NEXT: lsls r7, r7, #31 +; THUMB1-NEXT: asrs r7, r7, #31 +; THUMB1-NEXT: mov r3, r6 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: ands r3, r7 +; THUMB1-NEXT: eors r3, r5 +; THUMB1-NEXT: pop {r4, r5, r6, r7, pc} +; +; THUMB2-LABEL: ct_v4i8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, r5, r7, lr} +; THUMB2-NEXT: push {r4, r5, r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldrb.w lr, [sp, #20] +; THUMB2-NEXT: rsb.w r4, r12, #0 +; THUMB2-NEXT: rsb.w r5, r12, #0 +; THUMB2-NEXT: and.w r0, r1, r4 +; THUMB2-NEXT: bic.w r4, lr, r4 +; THUMB2-NEXT: orrs r0, r4 +; THUMB2-NEXT: ldrb.w r4, [sp, #24] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r2, lr +; THUMB2-NEXT: bic.w lr, r4, lr +; THUMB2-NEXT: ldrb.w r4, [sp, #28] +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r2, r3, lr +; THUMB2-NEXT: bic.w lr, r4, lr +; THUMB2-NEXT: orr.w r2, r2, lr +; THUMB2-NEXT: ldrb.w r4, [sp, #16] +; THUMB2-NEXT: ldrb.w lr, [sp, #32] +; THUMB2-NEXT: and.w r3, r4, r5 +; THUMB2-NEXT: bic.w r5, lr, r5 +; THUMB2-NEXT: orrs r3, r5 +; THUMB2-NEXT: pop {r4, r5, r7, pc} +entry: + %sel = call <4 x i8> @llvm.ct.select.i32(i1 %cond, <4 x i8> %a, <4 x i8> %b) + ret <4 x i8> %sel +} + +define <1 x i16> @ct_v1i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) { +; CT-LABEL: ct_v1i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: 
bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call <1 x i16> @llvm.ct.select.i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) + ret <1 x i16> %sel +} + +define <2 x i16> @ct_v2i16(i1 %cond, <2 x i16> %a, <2 x i16> %b) { +; CT-LABEL: ct_v2i16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v2i16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r1, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: ldrh r3, [sp, #8] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r2, lr +; DEFAULT-NEXT: bic lr, r3, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_v2i16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r3 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r3 +; THUMB1-NEXT: ldr r3, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r2 +; THUMB1-NEXT: eors r1, r3 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r3 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_v2i16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r1, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: ldrh.w r3, [sp, #8] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r2, lr +; THUMB2-NEXT: bic.w lr, r3, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +entry: + %sel = call <2 x i16> @llvm.ct.select.i32(i1 %cond, <2 x i16> %a, <2 x i16> %b) + ret <2 x i16> %sel +} + +define <1 x i32> @ct_v1i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) { +; CT-LABEL: ct_v1i32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1i32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: 
and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1i32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1i32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call <1 x i32> @llvm.ct.select.i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) + ret <1 x i32> %sel +} + +define <1 x float> @ct_v1f32(i1 %cond, <1 x float> %a, <1 x float> %b) { +; CT-LABEL: ct_v1f32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: vmov s0, r2 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vmov s2, r1 +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov r3, s0 +; CT-NEXT: vmov r2, s2 +; CT-NEXT: and r2, r2, r1 +; CT-NEXT: bic r1, r3, r1 +; CT-NEXT: orr r2, r2, r1 +; CT-NEXT: vmov s4, r2 +; CT-NEXT: vmov r0, s4 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_v1f32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_v1f32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_v1f32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +entry: + %sel = call <1 x float> @llvm.ct.select.f32(i1 %cond, <1 x float> %a, <1 x float> %b) + ret <1 x float> %sel +} diff --git a/llvm/test/CodeGen/ARM/ctselect.ll b/llvm/test/CodeGen/ARM/ctselect.ll new file mode 100644 index 0000000000000..40e17cb135627 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ctselect.ll @@ -0,0 +1,555 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s +; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s +; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s +; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s +; RUN: llc < %s -mtriple=thumbv7-linux-gnueabihf -mcpu=cortex-a9 -verify-machineinstrs | FileCheck --check-prefix=CORTEXA9 %s +; RUN: llc < %s -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 -verify-machineinstrs | FileCheck --check-prefix=CORTEX-NOTHUMB %s + +define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) { +; CT-LABEL: ct_i1: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic 
r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_i1: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_i1: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_i1: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_i1: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_i1: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %sel +} + +define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) { +; CT-LABEL: ct_int8: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_int8: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_int8: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_int8: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_int8: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_int8: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %sel +} + +define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) { +; CT-LABEL: ct_int16: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: 
rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_int16: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_int16: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_int16: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_int16: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_int16: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %sel +} + +define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) { +; CT-LABEL: ct_int32: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r3, r0, #1 +; CT-NEXT: rsb r12, r3, #0 +; CT-NEXT: and r0, r1, r12 +; CT-NEXT: bic r12, r2, r12 +; CT-NEXT: orr r0, r0, r12 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_int32: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_int32: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_int32: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_int32: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r3, r0, #1 +; CORTEXA9-NEXT: rsb.w r12, r3, #0 +; CORTEXA9-NEXT: and.w r0, r1, r12 +; CORTEXA9-NEXT: bic.w r12, r2, r12 +; CORTEXA9-NEXT: orr.w r0, r0, r12 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_int32: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r3, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r12, r3, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r1, r12 +; CORTEX-NOTHUMB-NEXT: bic r12, r2, r12 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r12 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %sel +} + +define i64 @ct_int64(i1 %cond, i64 %a, 
i64 %b) { +; CT-LABEL: ct_int64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: .save {r4, lr} +; CT-NEXT: push {r4, lr} +; CT-NEXT: and lr, r0, #1 +; CT-NEXT: ldr r12, [sp, #12] +; CT-NEXT: rsb r4, lr, #0 +; CT-NEXT: ldr r1, [sp, #8] +; CT-NEXT: and r0, r2, r4 +; CT-NEXT: rsb r2, lr, #0 +; CT-NEXT: bic r4, r1, r4 +; CT-NEXT: and r1, r3, r2 +; CT-NEXT: bic r2, r12, r2 +; CT-NEXT: orr r0, r0, r4 +; CT-NEXT: orr r1, r1, r2 +; CT-NEXT: pop {r4, pc} +; +; DEFAULT-LABEL: ct_int64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_int64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_int64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; +; CORTEXA9-LABEL: ct_int64: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: .save {r4, lr} +; CORTEXA9-NEXT: push {r4, lr} +; CORTEXA9-NEXT: and lr, r0, #1 +; CORTEXA9-NEXT: ldrd r1, r12, [sp, #8] +; CORTEXA9-NEXT: rsb.w r4, lr, #0 +; CORTEXA9-NEXT: and.w r0, r2, r4 +; CORTEXA9-NEXT: rsb.w r2, lr, #0 +; CORTEXA9-NEXT: bic.w r4, r1, r4 +; CORTEXA9-NEXT: and.w r1, r3, r2 +; CORTEXA9-NEXT: bic.w r2, r12, r2 +; CORTEXA9-NEXT: orrs r0, r4 +; CORTEXA9-NEXT: orr.w r1, r1, r2 +; CORTEXA9-NEXT: pop {r4, pc} +; +; CORTEX-NOTHUMB-LABEL: ct_int64: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: .save {r4, lr} +; CORTEX-NOTHUMB-NEXT: push {r4, lr} +; CORTEX-NOTHUMB-NEXT: and lr, r0, #1 +; CORTEX-NOTHUMB-NEXT: ldr r12, [sp, #12] +; CORTEX-NOTHUMB-NEXT: ldr r1, [sp, #8] +; CORTEX-NOTHUMB-NEXT: rsb r4, lr, #0 +; CORTEX-NOTHUMB-NEXT: and r0, r2, r4 +; CORTEX-NOTHUMB-NEXT: rsb r2, lr, #0 +; CORTEX-NOTHUMB-NEXT: bic r4, r1, r4 +; CORTEX-NOTHUMB-NEXT: and r1, r3, r2 +; CORTEX-NOTHUMB-NEXT: bic r2, r12, r2 +; CORTEX-NOTHUMB-NEXT: orr r0, r0, r4 +; CORTEX-NOTHUMB-NEXT: orr r1, r1, r2 +; CORTEX-NOTHUMB-NEXT: pop {r4, pc} +entry: + %sel = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %sel +} + +define float @ct_float(i1 %cond, float %a, float %b) { +; CT-LABEL: ct_float: +; CT: @ %bb.0: @ %entry +; CT-NEXT: vmov s0, r2 +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vmov s2, r1 +; 
CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov r3, s0 +; CT-NEXT: vmov r2, s2 +; CT-NEXT: and r2, r2, r1 +; CT-NEXT: bic r1, r3, r1 +; CT-NEXT: orr r2, r2, r1 +; CT-NEXT: vmov s4, r2 +; CT-NEXT: vmov r0, s4 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_float: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: and r3, r0, #1 +; DEFAULT-NEXT: rsb r12, r3, #0 +; DEFAULT-NEXT: and r0, r1, r12 +; DEFAULT-NEXT: bic r12, r2, r12 +; DEFAULT-NEXT: orr r0, r0, r12 +; DEFAULT-NEXT: bx lr +; +; THUMB1-LABEL: ct_float: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r3, #1 +; THUMB1-NEXT: ands r3, r0 +; THUMB1-NEXT: mov r4, r3 +; THUMB1-NEXT: lsls r4, r4, #31 +; THUMB1-NEXT: asrs r4, r4, #31 +; THUMB1-NEXT: mov r0, r1 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: ands r0, r4 +; THUMB1-NEXT: eors r0, r2 +; THUMB1-NEXT: pop {r4, pc} +; +; THUMB2-LABEL: ct_float: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: and r3, r0, #1 +; THUMB2-NEXT: rsb.w r12, r3, #0 +; THUMB2-NEXT: and.w r0, r1, r12 +; THUMB2-NEXT: bic.w r12, r2, r12 +; THUMB2-NEXT: orr.w r0, r0, r12 +; THUMB2-NEXT: bx lr +; +; CORTEXA9-LABEL: ct_float: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r0, r0, #1 +; CORTEXA9-NEXT: vmov r2, s0 +; CORTEXA9-NEXT: vmov r3, s1 +; CORTEXA9-NEXT: rsbs r1, r0, #0 +; CORTEXA9-NEXT: ands r2, r1 +; CORTEXA9-NEXT: bic.w r1, r3, r1 +; CORTEXA9-NEXT: orrs r2, r1 +; CORTEXA9-NEXT: vmov s2, r2 +; CORTEXA9-NEXT: vmov.f32 s0, s2 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_float: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r0, r0, #1 +; CORTEX-NOTHUMB-NEXT: vmov r2, s0 +; CORTEX-NOTHUMB-NEXT: vmov r3, s1 +; CORTEX-NOTHUMB-NEXT: rsb r1, r0, #0 +; CORTEX-NOTHUMB-NEXT: and r2, r2, r1 +; CORTEX-NOTHUMB-NEXT: bic r1, r3, r1 +; CORTEX-NOTHUMB-NEXT: orr r2, r2, r1 +; CORTEX-NOTHUMB-NEXT: vmov s2, r2 +; CORTEX-NOTHUMB-NEXT: vmov.f32 s0, s2 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %sel +} + +define double @ct_f64(i1 %cond, double %a, double %b) { +; CT-LABEL: ct_f64: +; CT: @ %bb.0: @ %entry +; CT-NEXT: and r0, r0, #1 +; CT-NEXT: vldr d16, [sp] +; CT-NEXT: rsb r1, r0, #0 +; CT-NEXT: vmov d17, r2, r3 +; CT-NEXT: vdup.32 d19, r1 +; CT-NEXT: vand d18, d17, d19 +; CT-NEXT: vbic d19, d16, d19 +; CT-NEXT: vorr d18, d18, d19 +; CT-NEXT: vmov r0, r1, d18 +; CT-NEXT: bx lr +; +; DEFAULT-LABEL: ct_f64: +; DEFAULT: @ %bb.0: @ %entry +; DEFAULT-NEXT: push {r11, lr} +; DEFAULT-NEXT: and r12, r0, #1 +; DEFAULT-NEXT: ldr r1, [sp, #8] +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r0, r2, lr +; DEFAULT-NEXT: bic lr, r1, lr +; DEFAULT-NEXT: ldr r2, [sp, #12] +; DEFAULT-NEXT: orr r0, r0, lr +; DEFAULT-NEXT: rsb lr, r12, #0 +; DEFAULT-NEXT: and r1, r3, lr +; DEFAULT-NEXT: bic lr, r2, lr +; DEFAULT-NEXT: orr r1, r1, lr +; DEFAULT-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: ct_f64: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r4, #1 +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: ldr r1, [sp, #16] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ands r0, r5 +; THUMB1-NEXT: eors r0, r1 +; THUMB1-NEXT: ldr r2, [sp, #20] +; THUMB1-NEXT: mov r5, r4 +; THUMB1-NEXT: lsls r5, r5, #31 +; THUMB1-NEXT: asrs r5, r5, #31 +; THUMB1-NEXT: mov r1, r3 +; THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: ands r1, r5 +; 
THUMB1-NEXT: eors r1, r2 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: ct_f64: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: and r12, r0, #1 +; THUMB2-NEXT: ldr r1, [sp, #8] +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r0, r2, lr +; THUMB2-NEXT: bic.w lr, r1, lr +; THUMB2-NEXT: ldr r2, [sp, #12] +; THUMB2-NEXT: orr.w r0, r0, lr +; THUMB2-NEXT: rsb.w lr, r12, #0 +; THUMB2-NEXT: and.w r1, r3, lr +; THUMB2-NEXT: bic.w lr, r2, lr +; THUMB2-NEXT: orr.w r1, r1, lr +; THUMB2-NEXT: pop {r7, pc} +; +; CORTEXA9-LABEL: ct_f64: +; CORTEXA9: @ %bb.0: @ %entry +; CORTEXA9-NEXT: and r0, r0, #1 +; CORTEXA9-NEXT: rsbs r1, r0, #0 +; CORTEXA9-NEXT: vdup.32 d17, r1 +; CORTEXA9-NEXT: vand d16, d0, d17 +; CORTEXA9-NEXT: vbic d17, d1, d17 +; CORTEXA9-NEXT: vorr d16, d16, d17 +; CORTEXA9-NEXT: vorr d0, d16, d16 +; CORTEXA9-NEXT: bx lr +; +; CORTEX-NOTHUMB-LABEL: ct_f64: +; CORTEX-NOTHUMB: @ %bb.0: @ %entry +; CORTEX-NOTHUMB-NEXT: and r0, r0, #1 +; CORTEX-NOTHUMB-NEXT: rsb r1, r0, #0 +; CORTEX-NOTHUMB-NEXT: vdup.32 d17, r1 +; CORTEX-NOTHUMB-NEXT: vand d16, d0, d17 +; CORTEX-NOTHUMB-NEXT: vbic d17, d1, d17 +; CORTEX-NOTHUMB-NEXT: vorr d16, d16, d17 +; CORTEX-NOTHUMB-NEXT: vorr d0, d16, d16 +; CORTEX-NOTHUMB-NEXT: bx lr +entry: + %sel = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %sel +} diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..42f460f2c598f --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll @@ -0,0 +1,451 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Portable edge case tests + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; M32-LABEL: test_ctselect_i1: +; M32: # %bb.0: +; M32-NEXT: xori $2, $4, 1 +; M32-NEXT: and $1, $4, $5 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i1: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: xori $2, $2, 1 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: and $2, $4, $5 +; M64-NEXT: sll $2, $2, 0 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; M32-LABEL: test_ctselect_extremal_values: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lui $3, 32768 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lui $3, 32767 +; M32-NEXT: ori $3, $3, 65535 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_extremal_values: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: lui $3, 32768 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: addiu $2, $1, -1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: lui $3, 32767 +; M64-NEXT: ori $3, $3, 65535 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 
%cond, ptr %ptr) { +; M32-LABEL: test_ctselect_null_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: jr $ra +; M32-NEXT: and $2, $1, $5 +; +; M64-LABEL: test_ctselect_null_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $1, $5 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; M32-LABEL: test_ctselect_function_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_function_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $2, $1, -1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; M32-LABEL: test_ctselect_ptr_cmp: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_ptr_cmp: +; M64: # %bb.0: +; M64-NEXT: xor $1, $4, $5 +; M64-NEXT: daddiu $3, $zero, -1 +; M64-NEXT: daddiu $2, $zero, -1 +; M64-NEXT: movn $3, $zero, $1 +; M64-NEXT: xor $2, $3, $2 +; M64-NEXT: and $1, $3, $6 +; M64-NEXT: and $2, $2, $7 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; M32-LABEL: test_ctselect_struct_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_struct_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $2, $1, -1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; M32-LABEL: test_ctselect_deeply_nested: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $3, 20($sp) +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $5, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 24($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: andi $3, $7, 1 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $6, 1 +; M32-NEXT: lw $6, 32($sp) +; M32-NEXT: negu $4, $3 +; M32-NEXT: addiu $3, $3, -1 +; M32-NEXT: negu $5, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $5, $1 +; M32-NEXT: lw $5, 28($sp) +; 
M32-NEXT: and $2, $2, $5 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: and $2, $3, $6 +; M32-NEXT: and $1, $4, $1 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_deeply_nested: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $4, $9, 0 +; M64-NEXT: sll $3, $8, 0 +; M64-NEXT: sll $8, $11, 0 +; M64-NEXT: lw $9, 0($sp) +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $1, $1, $4 +; M64-NEXT: sll $4, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: sll $5, $7, 0 +; M64-NEXT: andi $4, $4, 1 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: andi $3, $3, 1 +; M64-NEXT: andi $5, $5, 1 +; M64-NEXT: negu $2, $4 +; M64-NEXT: addiu $4, $4, -1 +; M64-NEXT: negu $7, $3 +; M64-NEXT: negu $6, $5 +; M64-NEXT: addiu $5, $5, -1 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: sll $2, $10, 0 +; M64-NEXT: and $2, $4, $2 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: addiu $2, $3, -1 +; M64-NEXT: and $1, $7, $1 +; M64-NEXT: and $2, $2, $8 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: and $2, $5, $9 +; M64-NEXT: and $1, $6, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + + ; This test demonstrates the FStar cmovznz4 pattern using ct.select +; Based on https://godbolt.org/z/6Kb71Ks7z +; Shows that NoMerge flag prevents DAG optimization from introducing branches +define void @cmovznz4_fstar_original(i64 %cin, ptr %x, ptr %y, ptr %r) { +; M32-LABEL: cmovznz4_fstar_original: +; M32: # %bb.0: # %entry +; M32-NEXT: or $1, $4, $5 +; M32-NEXT: addiu $2, $7, 16 +; M32-NEXT: addiu $3, $6, 16 +; M32-NEXT: addiu $4, $6, 8 +; M32-NEXT: movz $2, $3, $1 +; M32-NEXT: addiu $3, $7, 8 +; M32-NEXT: movz $3, $4, $1 +; M32-NEXT: addiu $4, $7, 24 +; M32-NEXT: movz $7, $6, $1 +; M32-NEXT: addiu $6, $6, 24 +; M32-NEXT: lw $9, 4($2) +; M32-NEXT: lw $2, 0($2) +; M32-NEXT: movz $4, $6, $1 +; M32-NEXT: lw $5, 4($7) +; M32-NEXT: lw $8, 4($3) +; M32-NEXT: lw $7, 0($7) +; M32-NEXT: lw $3, 0($3) +; M32-NEXT: lw $6, 16($sp) +; M32-NEXT: lw $1, 4($4) +; M32-NEXT: lw $4, 0($4) +; M32-NEXT: sw $4, 24($6) +; M32-NEXT: sw $1, 28($6) +; M32-NEXT: sw $2, 16($6) +; M32-NEXT: sw $9, 20($6) +; M32-NEXT: sw $3, 8($6) +; M32-NEXT: sw $8, 12($6) +; M32-NEXT: sw $7, 0($6) +; M32-NEXT: jr $ra +; M32-NEXT: sw $5, 4($6) +; +; M64-LABEL: cmovznz4_fstar_original: +; M64: # %bb.0: # %entry +; M64-NEXT: daddiu $1, $6, 8 +; M64-NEXT: daddiu $2, $5, 8 +; M64-NEXT: daddiu $3, $6, 16 +; M64-NEXT: daddiu $8, $5, 16 +; M64-NEXT: movz $1, $2, $4 +; M64-NEXT: move $2, $6 +; M64-NEXT: daddiu $6, $6, 24 +; M64-NEXT: movz $3, $8, $4 +; M64-NEXT: movz $2, $5, $4 +; M64-NEXT: daddiu $5, $5, 24 +; M64-NEXT: ld $1, 0($1) +; M64-NEXT: ld $3, 0($3) +; M64-NEXT: movz $6, $5, $4 +; M64-NEXT: ld $2, 0($2) +; M64-NEXT: ld $4, 0($6) +; M64-NEXT: sd $4, 24($7) +; M64-NEXT: sd $3, 16($7) +; M64-NEXT: sd $1, 8($7) +; M64-NEXT: jr $ra +; M64-NEXT: sd $2, 0($7) +entry: + %.not.i = icmp eq i64 %cin, 0 + %0 = load i64, ptr %y, align 8 + %1 = load i64, ptr %x, align 8 + %or = select i1 %.not.i, i64 %1, i64 %0 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %2 = load i64, ptr %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx6, align 8 + %or9 = select i1 %.not.i, 
i64 %3, i64 %2 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %4 = load i64, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %5 = load i64, ptr %arrayidx12, align 8 + %or15 = select i1 %.not.i, i64 %5, i64 %4 + %arrayidx16 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %6 = load i64, ptr %arrayidx16, align 8 + %arrayidx18 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %7 = load i64, ptr %arrayidx18, align 8 + %or21 = select i1 %.not.i, i64 %7, i64 %6 + store i64 %or, ptr %r, align 8 + %arrayidx23 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %or9, ptr %arrayidx23, align 8 + %arrayidx24 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %or15, ptr %arrayidx24, align 8 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %or21, ptr %arrayidx25, align 8 + ret void +} + +define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { +; M32-LABEL: cmovznz4_builtin_ctselect: +; M32: # %bb.0: # %entry +; M32-NEXT: or $1, $4, $5 +; M32-NEXT: lw $3, 4($7) +; M32-NEXT: lw $4, 4($6) +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $4, $1, $4 +; M32-NEXT: or $3, $4, $3 +; M32-NEXT: lw $4, 16($sp) +; M32-NEXT: sw $3, 4($4) +; M32-NEXT: lw $3, 0($7) +; M32-NEXT: lw $5, 0($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 0($4) +; M32-NEXT: lw $3, 12($7) +; M32-NEXT: lw $5, 12($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 12($4) +; M32-NEXT: lw $3, 8($7) +; M32-NEXT: lw $5, 8($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 8($4) +; M32-NEXT: lw $3, 20($7) +; M32-NEXT: lw $5, 20($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 20($4) +; M32-NEXT: lw $3, 16($7) +; M32-NEXT: lw $5, 16($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 16($4) +; M32-NEXT: lw $3, 28($7) +; M32-NEXT: lw $5, 28($6) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: and $5, $1, $5 +; M32-NEXT: or $3, $5, $3 +; M32-NEXT: sw $3, 28($4) +; M32-NEXT: lw $3, 24($7) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lw $3, 24($6) +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: jr $ra +; M32-NEXT: sw $1, 24($4) +; +; M64-LABEL: cmovznz4_builtin_ctselect: +; M64: # %bb.0: # %entry +; M64-NEXT: daddiu $2, $zero, -1 +; M64-NEXT: daddiu $1, $zero, -1 +; M64-NEXT: ld $3, 0($5) +; M64-NEXT: movn $2, $zero, $4 +; M64-NEXT: ld $4, 0($6) +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: and $3, $2, $3 +; M64-NEXT: and $4, $1, $4 +; M64-NEXT: or $3, $3, $4 +; M64-NEXT: sd $3, 0($7) +; M64-NEXT: ld $3, 8($6) +; M64-NEXT: ld $4, 8($5) +; M64-NEXT: and $3, $1, $3 +; M64-NEXT: and $4, $2, $4 +; M64-NEXT: or $3, $4, $3 +; M64-NEXT: sd $3, 8($7) +; M64-NEXT: ld $3, 16($6) +; M64-NEXT: ld $4, 16($5) +; M64-NEXT: and $3, $1, $3 +; M64-NEXT: and $4, $2, $4 +; M64-NEXT: or $3, $4, $3 +; M64-NEXT: sd $3, 16($7) +; M64-NEXT: ld $3, 24($6) +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: ld $3, 24($5) +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: sd $1, 24($7) +entry: + %cmp = icmp eq i64 %cin, 0 + %0 = load i64, ptr %x, align 8 + %1 = load i64, ptr %y, align 8 + %2 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %0, i64 %1) + store i64 %2, ptr %r, 
align 8 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %4 = load i64, ptr %arrayidx5, align 8 + %5 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %3, i64 %4) + %arrayidx6 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %5, ptr %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %6 = load i64, ptr %arrayidx8, align 8 + %arrayidx9 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %7 = load i64, ptr %arrayidx9, align 8 + %8 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %6, i64 %7) + %arrayidx10 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %8, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %9 = load i64, ptr %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %10 = load i64, ptr %arrayidx13, align 8 + %11 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %9, i64 %10) + %arrayidx14 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %11, ptr %arrayidx14, align 8 + ret void +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..8fc1af159ec17 --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll @@ -0,0 +1,413 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Test smin(x, 0) pattern +define i32 @test_ctselect_smin_zero(i32 %x) { +; M32-LABEL: test_ctselect_smin_zero: +; M32: # %bb.0: +; M32-NEXT: sra $1, $4, 31 +; M32-NEXT: jr $ra +; M32-NEXT: and $2, $1, $4 +; +; M64-LABEL: test_ctselect_smin_zero: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sra $2, $1, 31 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $2, $1 + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; M32-LABEL: test_ctselect_smax_zero: +; M32: # %bb.0: +; M32-NEXT: slti $1, $4, 1 +; M32-NEXT: movn $4, $zero, $1 +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_ctselect_smax_zero: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: slti $1, $2, 1 +; M64-NEXT: jr $ra +; M64-NEXT: movn $2, $zero, $1 + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_smin_generic: +; M32: # %bb.0: +; M32-NEXT: slt $1, $4, $5 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_smin_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: slt $3, $2, $1 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 +; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $1, $4, $1 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + 
%cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_smax_generic: +; M32: # %bb.0: +; M32-NEXT: slt $1, $5, $4 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_smax_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: slt $3, $2, $1 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 +; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $2, $4, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_umin_generic: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $4, $5 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_umin_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sltu $3, $2, $1 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 +; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $1, $4, $1 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; M32-LABEL: test_ctselect_umax_generic: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $5, $4 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $5 +; M32-NEXT: and $1, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_umax_generic: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: sltu $3, $2, $1 +; M64-NEXT: xori $3, $3, 1 +; M64-NEXT: negu $4, $3 +; M64-NEXT: addiu $3, $3, -1 +; M64-NEXT: and $2, $4, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; M32-LABEL: test_ctselect_abs: +; M32: # %bb.0: +; M32-NEXT: negu $1, $4 +; M32-NEXT: sra $2, $4, 31 +; M32-NEXT: and $1, $2, $1 +; M32-NEXT: not $2, $2 +; M32-NEXT: and $2, $2, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_abs: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: negu $2, $1 +; M64-NEXT: sra $3, $1, 31 +; M64-NEXT: and $2, $3, $2 +; M64-NEXT: not $3, $3 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; M32-LABEL: test_ctselect_nabs: +; M32: # %bb.0: +; M32-NEXT: sra $1, $4, 31 +; M32-NEXT: negu $3, $4 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: not $1, $1 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: 
test_ctselect_nabs: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: sra $2, $1, 31 +; M64-NEXT: and $3, $2, $1 +; M64-NEXT: negu $1, $1 +; M64-NEXT: not $2, $2 +; M64-NEXT: and $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $3, $1 + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; M32-LABEL: test_ctselect_sign_extend: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: sra $2, $4, 31 +; +; M64-LABEL: test_ctselect_sign_extend: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: jr $ra +; M64-NEXT: sra $2, $1, 31 + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; M32-LABEL: test_ctselect_zero_extend: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: sltu $2, $zero, $4 +; +; M64-LABEL: test_ctselect_zero_extend: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: jr $ra +; M64-NEXT: sltu $2, $zero, $1 + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_constant_folding_true: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_ctselect_constant_folding_true: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_constant_folding_false: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; +; M64-LABEL: test_ctselect_constant_folding_false: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; M32-LABEL: test_ctselect_identical_operands: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; +; M64-LABEL: test_ctselect_identical_operands: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_inverted_condition: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_inverted_condition: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 
%c, i32 %d) { +; M32-LABEL: test_ctselect_chain: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $5, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 20($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $6, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 24($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_chain: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $8, 0 +; M64-NEXT: sll $4, $10, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $9, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: and $2, $2, $4 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; M32-LABEL: test_ctselect_i64_smin_zero: +; M32: # %bb.0: +; M32-NEXT: sra $1, $5, 31 +; M32-NEXT: and $2, $1, $4 +; M32-NEXT: jr $ra +; M32-NEXT: and $3, $1, $5 +; +; M64-LABEL: test_ctselect_i64_smin_zero: +; M64: # %bb.0: +; M64-NEXT: dsra $1, $4, 63 +; M64-NEXT: jr $ra +; M64-NEXT: and $2, $1, $4 + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll new file mode 100644 index 0000000000000..1e18a87ea6605 --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll @@ -0,0 +1,712 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mips64-unknown-linux-gnu -mcpu=mips64r6 -mattr=+msa -O3 | FileCheck %s --check-prefix=MIPS64-MSA +; RUN: llc < %s -mtriple=mips-unknown-linux-gnu -mcpu=mips32r6 -mattr=+msa -O3 | FileCheck %s --check-prefix=MIPS32-MSA + +; Test 32-bit integer vector (128 bits) +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: 
shf.w $w0, $w2, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w2[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w2[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w2[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w2[3] + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test 16-bit integer vector (8 x i16 = 128-bit) +define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v8i16: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.h $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.h $w2, $w2, 15 +; MIPS64-MSA-NEXT: shf.h $w0, $w0, 27 +; MIPS64-MSA-NEXT: shf.h $w1, $w1, 27 +; MIPS64-MSA-NEXT: srai.h $w2, $w2, 15 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.h $w0, $w2, 27 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v8i16: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.h $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w1[0], $2 +; MIPS32-MSA-NEXT: insert.w $w0[1], $7 +; MIPS32-MSA-NEXT: lw $2, 32($sp) +; MIPS32-MSA-NEXT: slli.h $w2, $w2, 15 +; MIPS32-MSA-NEXT: srai.h $w2, $w2, 15 +; MIPS32-MSA-NEXT: insert.w $w1[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 20($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.h $w0, $w0, 177 +; MIPS32-MSA-NEXT: shf.h $w1, $w1, 177 +; MIPS32-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS32-MSA-NEXT: shf.h $w0, $w2, 177 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %result = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %result +} + +; Test byte vector (16 x i8 = 128-bit) +define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v16i8: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.b $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; 
MIPS64-MSA-NEXT: slli.b $w2, $w2, 7 +; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27 +; MIPS64-MSA-NEXT: shf.b $w1, $w1, 27 +; MIPS64-MSA-NEXT: srai.b $w2, $w2, 7 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: bmnz.v $w0, $w1, $w2 +; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v16i8: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.b $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w1[0], $2 +; MIPS32-MSA-NEXT: insert.w $w0[1], $7 +; MIPS32-MSA-NEXT: lw $2, 32($sp) +; MIPS32-MSA-NEXT: slli.b $w2, $w2, 7 +; MIPS32-MSA-NEXT: srai.b $w2, $w2, 7 +; MIPS32-MSA-NEXT: insert.w $w1[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 20($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.b $w0, $w0, 27 +; MIPS32-MSA-NEXT: shf.b $w1, $w1, 27 +; MIPS32-MSA-NEXT: bmnz.v $w1, $w0, $w2 +; MIPS32-MSA-NEXT: shf.b $w0, $w1, 27 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %result +} + +; Test 64-bit integer vector (2 x i64 = 128-bit) +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v2i64: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $5 +; MIPS64-MSA-NEXT: insert.d $w1[0], $7 +; MIPS64-MSA-NEXT: fill.d $w2, $4 +; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: insert.d $w0[1], $6 +; MIPS64-MSA-NEXT: insert.d $w1[1], $8 +; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS64-MSA-NEXT: copy_s.d $2, $w2[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w2[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v2i64: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: addiu $sp, $sp, -32 +; MIPS32-MSA-NEXT: .cfi_def_cfa_offset 32 +; MIPS32-MSA-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: .cfi_offset 31, -4 +; MIPS32-MSA-NEXT: .cfi_offset 30, -8 +; MIPS32-MSA-NEXT: move $fp, $sp +; MIPS32-MSA-NEXT: .cfi_def_cfa_register 30 +; MIPS32-MSA-NEXT: addiu $1, $zero, -16 +; MIPS32-MSA-NEXT: and $sp, $sp, $1 +; MIPS32-MSA-NEXT: lw $2, 56($fp) +; MIPS32-MSA-NEXT: lw $1, 60($fp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: sw $4, 12($sp) +; MIPS32-MSA-NEXT: sw $4, 4($sp) +; MIPS32-MSA-NEXT: ld.d $w2, 0($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 64($fp) +; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 68($fp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 48($fp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 52($fp) +; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177 +; 
MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] +; MIPS32-MSA-NEXT: move $sp, $fp +; MIPS32-MSA-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: addiu $sp, $sp, 32 + %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +; Test single-precision float vector (4 x float = 128-bit) +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v4f32: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4f32: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $5 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w2, 0($4) + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +; Test double-precision float vector (2 x double = 128-bit) +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v2f64: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $5 +; MIPS64-MSA-NEXT: insert.d $w1[0], $7 +; MIPS64-MSA-NEXT: fill.d $w2, $4 +; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: insert.d $w0[1], $6 +; MIPS64-MSA-NEXT: insert.d $w1[1], $8 +; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS64-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS64-MSA-NEXT: copy_s.d $2, $w2[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w2[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v2f64: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: addiu $sp, $sp, -32 +; MIPS32-MSA-NEXT: .cfi_def_cfa_offset 32 +; MIPS32-MSA-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; MIPS32-MSA-NEXT: .cfi_offset 31, -4 +; MIPS32-MSA-NEXT: .cfi_offset 30, -8 +; MIPS32-MSA-NEXT: move $fp, $sp +; MIPS32-MSA-NEXT: .cfi_def_cfa_register 30 +; MIPS32-MSA-NEXT: addiu $1, $zero, -16 +; MIPS32-MSA-NEXT: and $sp, $sp, $1 +; MIPS32-MSA-NEXT: lw $2, 56($fp) +; MIPS32-MSA-NEXT: lw $1, 60($fp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 
+; MIPS32-MSA-NEXT: sw $5, 12($sp) +; MIPS32-MSA-NEXT: sw $5, 4($sp) +; MIPS32-MSA-NEXT: ld.d $w2, 0($sp) +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 64($fp) +; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63 +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 68($fp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 48($fp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 52($fp) +; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: st.d $w2, 0($4) +; MIPS32-MSA-NEXT: move $sp, $fp +; MIPS32-MSA-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: addiu $sp, $sp, 32 + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; Test with aligned loads (common case) +define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_aligned_load: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: ld.w $w1, 0($5) +; MIPS64-MSA-NEXT: ld.w $w2, 0($6) +; MIPS64-MSA-NEXT: fill.w $w0, $1 +; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_aligned_load: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: fill.w $w0, $4 +; MIPS32-MSA-NEXT: ld.w $w1, 0($5) +; MIPS32-MSA-NEXT: ld.w $w2, 0($6) +; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %a = load <4 x i32>, ptr %p1, align 16 + %b = load <4 x i32>, ptr %p2, align 16 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with unaligned loads (stress test) +define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_unaligned_load: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: ld.w $w1, 0($5) +; MIPS64-MSA-NEXT: ld.w $w2, 0($6) +; MIPS64-MSA-NEXT: fill.w $w0, $1 +; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS64-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_unaligned_load: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: fill.w $w0, $4 +; MIPS32-MSA-NEXT: ld.w $w1, 0($5) +; MIPS32-MSA-NEXT: ld.w $w2, 0($6) +; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31 +; MIPS32-MSA-NEXT: bsel.v $w0, $w2, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %a = load <4 x i32>, 
ptr %p1, align 4 + %b = load <4 x i32>, ptr %p2, align 4 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with stores to verify result handling +define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_store: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: st.w $w2, 0($9) +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_store: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 40($sp) +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w2, 0($1) + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + store <4 x i32> %result, ptr %out, align 16 + ret void +} + +; Test chained selects (multiple conditions) +define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_chain: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $8 +; MIPS64-MSA-NEXT: insert.d $w1[0], $6 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: sll $1, $5, 0 +; MIPS64-MSA-NEXT: insert.d $w0[1], $9 +; MIPS64-MSA-NEXT: insert.d $w1[1], $7 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: insert.d $w0[0], $10 +; MIPS64-MSA-NEXT: fill.w $w1, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $11 +; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: bsel.v $w1, $w0, $w2 +; MIPS64-MSA-NEXT: shf.w $w0, $w1, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_chain: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw 
$1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: lw $2, 40($sp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 44($sp) +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: fill.w $w1, $5 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 48($sp) +; MIPS32-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 52($sp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w1, $w0, $w2 +; MIPS32-MSA-NEXT: copy_s.w $2, $w1[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w1[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w1[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w1[3] + %tmp = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond1, <4 x i32> %a, <4 x i32> %b) + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond2, <4 x i32> %tmp, <4 x i32> %c) + ret <4 x i32> %result +} + +; Test with arithmetic operations (ensure float vectors work with FP ops) +define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4 x float> %y) { +; MIPS64-MSA-LABEL: test_ctselect_v4f32_arithmetic: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: fsub.w $w2, $w1, $w0 +; MIPS64-MSA-NEXT: fadd.w $w0, $w1, $w0 +; MIPS64-MSA-NEXT: fill.w $w1, $1 +; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS64-MSA-NEXT: bsel.v $w1, $w2, $w0 +; MIPS64-MSA-NEXT: shf.w $w0, $w1, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4f32_arithmetic: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 20($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $1 +; MIPS32-MSA-NEXT: fsub.w $w2, $w1, $w0 +; MIPS32-MSA-NEXT: fadd.w $w0, $w1, $w0 +; MIPS32-MSA-NEXT: fill.w $w1, $5 +; MIPS32-MSA-NEXT: slli.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: srai.w $w1, $w1, 31 +; MIPS32-MSA-NEXT: bsel.v $w1, $w2, $w0 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w1, 0($4) + %sum = fadd <4 x float> %x, %y + %diff = fsub <4 x float> %x, %y + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %sum, <4 x float> %diff) + ret <4 x float> %result +} + +; Test with mixed operations (load, compute, select, store) +define void @test_ctselect_v4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_mixed: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: ld.w $w0, 0($5) +; MIPS64-MSA-NEXT: ld.w $w1, 0($6) +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: addvi.w $w0, $w0, 1 +; MIPS64-MSA-NEXT: addvi.w $w1, $w1, 2 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; 
MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: st.w $w2, 0($7) +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_mixed: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: ld.w $w0, 0($5) +; MIPS32-MSA-NEXT: ld.w $w1, 0($6) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: addvi.w $w0, $w0, 1 +; MIPS32-MSA-NEXT: addvi.w $w1, $w1, 2 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: bsel.v $w2, $w1, $w0 +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: st.w $w2, 0($7) + %a = load <4 x i32>, ptr %p1, align 16 + %b = load <4 x i32>, ptr %p2, align 16 + %a_plus_1 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1> + %b_plus_2 = add <4 x i32> %b, <i32 2, i32 2, i32 2, i32 2> + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a_plus_1, <4 x i32> %b_plus_2) + store <4 x i32> %result, ptr %out, align 16 + ret void +} + +; Test with function arguments directly (no loads) +define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_args: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: shf.w $w0, $w2, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_args: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: copy_s.w $2, $w2[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w2[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w2[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w2[3] + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with multiple uses of result +define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; MIPS64-MSA-LABEL: test_ctselect_v4i32_multi_use: +; MIPS64-MSA: # %bb.0: +; MIPS64-MSA-NEXT: insert.d $w0[0], $7 +; MIPS64-MSA-NEXT: insert.d $w1[0], $5 +; MIPS64-MSA-NEXT: sll $1, $4, 0 +; MIPS64-MSA-NEXT: fill.w $w2, $1 +; MIPS64-MSA-NEXT: insert.d $w0[1], $8 +; MIPS64-MSA-NEXT: insert.d $w1[1], $6 +; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177 +; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS64-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS64-MSA-NEXT: addv.w $w0, $w2, $w2 +; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177 +; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0] +; MIPS64-MSA-NEXT: jr $ra +; MIPS64-MSA-NEXT: 
copy_s.d $3, $w0[1] +; +; MIPS32-MSA-LABEL: test_ctselect_v4i32_multi_use: +; MIPS32-MSA: # %bb.0: +; MIPS32-MSA-NEXT: lw $2, 24($sp) +; MIPS32-MSA-NEXT: insert.w $w1[0], $6 +; MIPS32-MSA-NEXT: lw $1, 28($sp) +; MIPS32-MSA-NEXT: fill.w $w2, $4 +; MIPS32-MSA-NEXT: insert.w $w0[0], $2 +; MIPS32-MSA-NEXT: insert.w $w1[1], $7 +; MIPS32-MSA-NEXT: lw $2, 20($sp) +; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31 +; MIPS32-MSA-NEXT: insert.w $w0[1], $1 +; MIPS32-MSA-NEXT: lw $1, 16($sp) +; MIPS32-MSA-NEXT: insert.w $w1[2], $1 +; MIPS32-MSA-NEXT: lw $1, 32($sp) +; MIPS32-MSA-NEXT: insert.w $w0[2], $1 +; MIPS32-MSA-NEXT: lw $1, 36($sp) +; MIPS32-MSA-NEXT: insert.w $w1[3], $2 +; MIPS32-MSA-NEXT: insert.w $w0[3], $1 +; MIPS32-MSA-NEXT: bsel.v $w2, $w0, $w1 +; MIPS32-MSA-NEXT: addv.w $w0, $w2, $w2 +; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0] +; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1] +; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2] +; MIPS32-MSA-NEXT: jr $ra +; MIPS32-MSA-NEXT: copy_s.w $5, $w0[3] + %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + %add = add <4 x i32> %sel, %sel ; Use result twice + ret <4 x i32> %add +} + +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback.ll b/llvm/test/CodeGen/Mips/ctselect-fallback.ll new file mode 100644 index 0000000000000..22b24b33cff3c --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-fallback.ll @@ -0,0 +1,615 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Test basic ct.select functionality for scalar types +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; M32-LABEL: test_ctselect_i8: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i8: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; M32-LABEL: test_ctselect_i16: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i16: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret 
i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_i32: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_i32: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; M32-LABEL: test_ctselect_i64: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $2, 16($sp) +; M32-NEXT: addiu $3, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $4, $1, $6 +; M32-NEXT: and $2, $3, $2 +; M32-NEXT: and $1, $1, $7 +; M32-NEXT: or $2, $4, $2 +; M32-NEXT: lw $4, 20($sp) +; M32-NEXT: and $3, $3, $4 +; M32-NEXT: jr $ra +; M32-NEXT: or $3, $1, $3 +; +; M64-LABEL: test_ctselect_i64: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $2, $1, -1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; M32-LABEL: test_ctselect_ptr: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_ptr: +; M64: # %bb.0: +; M64-NEXT: andi $1, $4, 1 +; M64-NEXT: daddiu $2, $1, -1 +; M64-NEXT: dnegu $1, $1 +; M64-NEXT: and $2, $2, $6 +; M64-NEXT: and $1, $1, $5 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_const_true: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_ctselect_const_true: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_const_false: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; +; M64-LABEL: test_ctselect_const_false: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_eq: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltu $1, $zero, $1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_icmp_eq: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: sltu $1, $zero, $1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: 
and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_ne: +; M32: # %bb.0: +; M32-NEXT: xor $1, $4, $5 +; M32-NEXT: sltiu $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_icmp_ne: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: xor $1, $2, $1 +; M64-NEXT: sltiu $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_slt: +; M32: # %bb.0: +; M32-NEXT: slt $1, $4, $5 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_icmp_slt: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: slt $1, $2, $1 +; M64-NEXT: xori $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; M32-LABEL: test_ctselect_icmp_ult: +; M32: # %bb.0: +; M32-NEXT: sltu $1, $4, $5 +; M32-NEXT: xori $1, $1, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_icmp_ult: +; M64: # %bb.0: +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $3, $7, 0 +; M64-NEXT: sltu $1, $2, $1 +; M64-NEXT: xori $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; M32-LABEL: test_ctselect_load: +; M32: # %bb.0: +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: lw $1, 0($6) +; M32-NEXT: addiu $3, $2, -1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 0($5) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $2, $1 +; +; M64-LABEL: test_ctselect_load: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: lw $1, 0($6) +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: lw $3, 0($5) +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested 
ctselect calls +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; M32-LABEL: test_ctselect_nested: +; M32: # %bb.0: +; M32-NEXT: andi $1, $5, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and $1, $1, $6 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_ctselect_nested: +; M64: # %bb.0: +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: sll $1, $7, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $6, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $8, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $1, $2 + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Test float (32-bit) +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; M32-LABEL: test_ctselect_f32: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: mfc1 $1, $f14 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: mfc1 $3, $f13 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test double (64-bit) +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; M32-LABEL: test_ctselect_f64: +; M32: # %bb.0: +; M32-NEXT: addiu $sp, $sp, -16 +; M32-NEXT: .cfi_def_cfa_offset 16 +; M32-NEXT: mtc1 $6, $f0 +; M32-NEXT: mtc1 $7, $f1 +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: lw $3, 36($sp) +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: sdc1 $f0, 8($sp) +; M32-NEXT: and $3, $2, $3 +; M32-NEXT: lw $4, 12($sp) +; M32-NEXT: and $4, $1, $4 +; M32-NEXT: or $3, $4, $3 +; M32-NEXT: sw $3, 4($sp) +; M32-NEXT: lw $3, 32($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: lw $3, 8($sp) +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: sw $1, 0($sp) +; M32-NEXT: ldc1 $f0, 0($sp) +; M32-NEXT: jr $ra +; M32-NEXT: addiu $sp, $sp, 16 +; +; M64-LABEL: test_ctselect_f64: +; M64: # %bb.0: +; M64-NEXT: andi $2, $4, 1 +; M64-NEXT: dmfc1 $1, $f14 +; M64-NEXT: daddiu $3, $2, -1 +; M64-NEXT: dnegu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: dmfc1 $3, $f13 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: dmtc1 $1, $f0 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + + +; Test chained float selects +define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, float %c) { +; M32-LABEL: test_ctselect_f32_chain: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $7 +; M32-NEXT: and 
$1, $1, $6 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: andi $2, $5, 1 +; M32-NEXT: negu $3, $2 +; M32-NEXT: addiu $2, $2, -1 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 16($sp) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $1, $2 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32_chain: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: mfc1 $1, $f15 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: mfc1 $3, $f14 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: negu $3, $2 +; M64-NEXT: addiu $2, $2, -1 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: mfc1 $3, $f16 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $1, $2 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %result = call float @llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) + ret float %result +} + +; Test with float load +define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { +; M32-LABEL: test_ctselect_f32_load: +; M32: # %bb.0: +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: lw $1, 0($6) +; M32-NEXT: addiu $3, $2, -1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 0($5) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32_load: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: lw $1, 0($6) +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: lw $3, 0($5) +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %a = load float, ptr %p1 + %b = load float, ptr %p2 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with double load +define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { +; M32-LABEL: test_ctselect_f64_load: +; M32: # %bb.0: +; M32-NEXT: addiu $sp, $sp, -8 +; M32-NEXT: .cfi_def_cfa_offset 8 +; M32-NEXT: andi $2, $4, 1 +; M32-NEXT: lw $1, 4($6) +; M32-NEXT: lw $4, 4($5) +; M32-NEXT: addiu $3, $2, -1 +; M32-NEXT: negu $2, $2 +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: and $4, $2, $4 +; M32-NEXT: or $1, $4, $1 +; M32-NEXT: sw $1, 4($sp) +; M32-NEXT: lw $1, 0($6) +; M32-NEXT: and $1, $3, $1 +; M32-NEXT: lw $3, 0($5) +; M32-NEXT: and $2, $2, $3 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: sw $1, 0($sp) +; M32-NEXT: ldc1 $f0, 0($sp) +; M32-NEXT: jr $ra +; M32-NEXT: addiu $sp, $sp, 8 +; +; M64-LABEL: test_ctselect_f64_load: +; M64: # %bb.0: +; M64-NEXT: andi $2, $4, 1 +; M64-NEXT: ld $1, 0($6) +; M64-NEXT: daddiu $3, $2, -1 +; M64-NEXT: dnegu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: ld $3, 0($5) +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: dmtc1 $1, $f0 + %a = load double, ptr %p1 + %b = load double, ptr %p2 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test mixed with arithmetic +define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { +; M32-LABEL: test_ctselect_f32_arithmetic: +; M32: # %bb.0: +; M32-NEXT: mtc1 $6, $f0 +; M32-NEXT: mtc1 $5, $f1 +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: negu $2, $1 +; M32-NEXT: addiu $1, $1, -1 +; M32-NEXT: add.s $f2, $f1, $f0 +; M32-NEXT: sub.s $f0, $f1, $f0 +; M32-NEXT: mfc1 $3, $f2 +; M32-NEXT: and $2, $2, $3 +; 
M32-NEXT: mfc1 $3, $f0 +; M32-NEXT: and $1, $1, $3 +; M32-NEXT: or $1, $2, $1 +; M32-NEXT: jr $ra +; M32-NEXT: mtc1 $1, $f0 +; +; M64-LABEL: test_ctselect_f32_arithmetic: +; M64: # %bb.0: +; M64-NEXT: add.s $f0, $f13, $f14 +; M64-NEXT: sll $1, $4, 0 +; M64-NEXT: andi $1, $1, 1 +; M64-NEXT: negu $2, $1 +; M64-NEXT: addiu $1, $1, -1 +; M64-NEXT: mfc1 $3, $f0 +; M64-NEXT: sub.s $f0, $f13, $f14 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: mfc1 $3, $f0 +; M64-NEXT: and $1, $1, $3 +; M64-NEXT: or $1, $2, $1 +; M64-NEXT: jr $ra +; M64-NEXT: mtc1 $1, $f0 + %sum = fadd float %x, %y + %diff = fsub float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %sum, float %diff) + ret float %result +} + +; Declare the intrinsics +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll new file mode 100644 index 0000000000000..9a0263ad5915c --- /dev/null +++ b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=mipsel-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M32 +; RUN: llc < %s -mtriple=mips64el-unknown-linux-gnu -O3 | FileCheck %s --check-prefix=M64 + +; Test 1: Basic optimizations should still work +define i32 @test_basic_opts(i32 %x) { +; M32-LABEL: test_basic_opts: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_basic_opts: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %a = or i32 %x, 0 + %b = and i32 %a, -1 + %c = xor i32 %b, 0 + ret i32 %c +} + +; Test 2: Constant folding should work +define i32 @test_constant_fold() { +; M32-LABEL: test_constant_fold: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_constant_fold: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %a = xor i32 -1, -1 ; Should fold to 0 + ret i32 %a +} + +; Test 3: Protected pattern should NOT have branches +define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_protected_no_branch: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: addiu $2, $1, -1 +; M32-NEXT: negu $1, $1 +; M32-NEXT: and $2, $2, $6 +; M32-NEXT: and $1, $1, $5 +; M32-NEXT: jr $ra +; M32-NEXT: or $2, $1, $2 +; +; M64-LABEL: test_protected_no_branch: +; M64: # %bb.0: +; M64-NEXT: sll $2, $4, 0 +; M64-NEXT: sll $1, $6, 0 +; M64-NEXT: andi $2, $2, 1 +; M64-NEXT: addiu $3, $2, -1 +; M64-NEXT: negu $2, $2 +; M64-NEXT: and $1, $3, $1 +; M64-NEXT: sll $3, $5, 0 +; M64-NEXT: and $2, $2, $3 +; M64-NEXT: jr $ra +; M64-NEXT: or $2, $2, $1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test 4: Explicit branch should still generate branches +define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_explicit_branch: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: beqz $1, $BB3_2 +; M32-NEXT: nop +; M32-NEXT: # %bb.1: # %true +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $5 +; M32-NEXT: $BB3_2: # %false +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $6 +; +; M64-LABEL: test_explicit_branch: +; M64: # %bb.0: +; M64-NEXT: sll $1, $4, 0 +; 
M64-NEXT: andi $1, $1, 1 +; M64-NEXT: beqz $1, .LBB3_2 +; M64-NEXT: nop +; M64-NEXT: # %bb.1: # %true +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $5, 0 +; M64-NEXT: .LBB3_2: # %false +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $6, 0 + br i1 %cond, label %true, label %false +true: + ret i32 %a +false: + ret i32 %b +} + +; Test 5: Regular select (not ct.select) - whatever wasm wants to do +define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) { +; M32-LABEL: test_regular_select: +; M32: # %bb.0: +; M32-NEXT: andi $1, $4, 1 +; M32-NEXT: movn $6, $5, $1 +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $6 +; +; M64-LABEL: test_regular_select: +; M64: # %bb.0: +; M64-NEXT: sll $3, $4, 0 +; M64-NEXT: sll $2, $6, 0 +; M64-NEXT: sll $1, $5, 0 +; M64-NEXT: andi $3, $3, 1 +; M64-NEXT: jr $ra +; M64-NEXT: movn $2, $1, $3 + %result = select i1 %cond, i32 %a, i32 %b + ret i32 %result +} + +; Test if XOR with all-ones still gets optimized +define i32 @test_xor_all_ones() { +; M32-LABEL: test_xor_all_ones: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_xor_all_ones: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %xor1 = xor i32 -1, -1 ; Should optimize to 0 + ret i32 %xor1 +} + +define i32 @test_xor_same_value(i32 %x) { +; M32-LABEL: test_xor_same_value: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_xor_same_value: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %xor2 = xor i32 %x, %x ; Should optimize to 0 + ret i32 %xor2 +} + +define i32 @test_normal_ops(i32 %x) { +; M32-LABEL: test_normal_ops: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: move $2, $4 +; +; M64-LABEL: test_normal_ops: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: sll $2, $4, 0 + %or1 = or i32 %x, 0 ; Should optimize to %x + %and1 = and i32 %or1, -1 ; Should optimize to %x + %xor1 = xor i32 %and1, 0 ; Should optimize to %x + ret i32 %xor1 +} + +; This simulates what the reviewer is worried about +define i32 @test_xor_with_const_operands() { +; M32-LABEL: test_xor_with_const_operands: +; M32: # %bb.0: +; M32-NEXT: jr $ra +; M32-NEXT: addiu $2, $zero, 0 +; +; M64-LABEL: test_xor_with_const_operands: +; M64: # %bb.0: +; M64-NEXT: jr $ra +; M64-NEXT: addiu $2, $zero, 0 + %a = xor i32 -1, -1 + %b = xor i32 0, 0 + %c = xor i32 42, 42 + %result = or i32 %a, %b + %final = or i32 %result, %c + ret i32 %final ; Should optimize to 0 +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..860f64c3672b0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-edge-cases.ll @@ -0,0 +1,462 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; RV64-LABEL: test_ctselect_i1: +; RV64: # %bb.0: +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: xori a0, a0, 1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i1: +; RV32: # %bb.0: +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + 
ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; RV64-LABEL: test_ctselect_extremal_values: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: lui a1, 524288 +; RV64-NEXT: addi a2, a0, -1 +; RV64-NEXT: negw a0, a0 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: slli a0, a0, 33 +; RV64-NEXT: srli a0, a0, 33 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_extremal_values: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: lui a1, 524288 +; RV32-NEXT: addi a2, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; RV64-LABEL: test_ctselect_null_ptr: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_null_ptr: +; RV32: # %bb.0: +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; RV64-LABEL: test_ctselect_function_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_function_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_ptr_cmp: +; RV64: # %bb.0: +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_ptr_cmp: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_struct_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_struct_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; 
Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; RV64-LABEL: test_ctselect_deeply_nested: +; RV64: # %bb.0: +; RV64-NEXT: lw t0, 0(sp) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a2, a2, 1 +; RV64-NEXT: andi a3, a3, 1 +; RV64-NEXT: addi t1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a5, t1, a5 +; RV64-NEXT: neg t1, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a0, a4 +; RV64-NEXT: neg a4, a2 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a1, a1, a6 +; RV64-NEXT: neg a6, a3 +; RV64-NEXT: addi a3, a3, -1 +; RV64-NEXT: and a2, a2, a7 +; RV64-NEXT: or a0, a0, a5 +; RV64-NEXT: and a0, t1, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: and a0, a4, a0 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: and a0, a6, a0 +; RV64-NEXT: and a1, a3, t0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_deeply_nested: +; RV32: # %bb.0: +; RV32-NEXT: lw t0, 0(sp) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: andi a3, a3, 1 +; RV32-NEXT: addi t1, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a5, t1, a5 +; RV32-NEXT: neg t1, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: neg a4, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a1, a1, a6 +; RV32-NEXT: neg a6, a3 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a2, a2, a7 +; RV32-NEXT: or a0, a0, a5 +; RV32-NEXT: and a0, t1, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: and a0, a4, a0 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: and a0, a6, a0 +; RV32-NEXT: and a1, a3, t0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; This test demonstrates the FStar cmovznz4 pattern using ct.select +; Based on https://godbolt.org/z/6Kb71Ks7z +; Shows that NoMerge flag prevents DAG optimization from introducing branches +define void @cmovznz4_fstar_original(i64 %cin, ptr %x, ptr %y, ptr %r) { +; RV64-LABEL: cmovznz4_fstar_original: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mv a4, a1 +; RV64-NEXT: beqz a0, .LBB7_2 +; RV64-NEXT: # %bb.1: # %entry +; RV64-NEXT: mv a4, a2 +; RV64-NEXT: .LBB7_2: # %entry +; RV64-NEXT: beqz a0, .LBB7_6 +; RV64-NEXT: # %bb.3: # %entry +; RV64-NEXT: addi a5, a2, 8 +; RV64-NEXT: bnez a0, .LBB7_7 +; RV64-NEXT: .LBB7_4: +; RV64-NEXT: addi a6, a1, 16 +; RV64-NEXT: ld a4, 0(a4) +; RV64-NEXT: ld a5, 0(a5) +; RV64-NEXT: ld a6, 0(a6) +; RV64-NEXT: bnez a0, .LBB7_8 +; RV64-NEXT: .LBB7_5: +; RV64-NEXT: addi a1, a1, 24 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: sd a4, 0(a3) +; RV64-NEXT: sd a5, 8(a3) +; RV64-NEXT: sd a6, 16(a3) +; RV64-NEXT: sd a0, 24(a3) +; RV64-NEXT: ret +; RV64-NEXT: .LBB7_6: +; RV64-NEXT: addi a5, a1, 8 +; RV64-NEXT: beqz a0, .LBB7_4 +; RV64-NEXT: .LBB7_7: # %entry +; RV64-NEXT: addi a6, a2, 16 +; RV64-NEXT: ld a4, 0(a4) +; RV64-NEXT: ld a5, 0(a5) +; RV64-NEXT: ld a6, 0(a6) +; RV64-NEXT: beqz a0, .LBB7_5 +; RV64-NEXT: .LBB7_8: # %entry +; RV64-NEXT: addi a1, a2, 24 +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: sd a4, 0(a3) +; RV64-NEXT: sd a5, 8(a3) +; RV64-NEXT: sd a6, 16(a3) +; RV64-NEXT: sd a0, 24(a3) +; RV64-NEXT: ret +; +; RV32-LABEL: cmovznz4_fstar_original: +; RV32: # %bb.0: # %entry +; RV32-NEXT: or a0, a0, a1 
+; RV32-NEXT: mv a1, a2 +; RV32-NEXT: beqz a0, .LBB7_2 +; RV32-NEXT: # %bb.1: # %entry +; RV32-NEXT: mv a1, a3 +; RV32-NEXT: .LBB7_2: # %entry +; RV32-NEXT: beqz a0, .LBB7_5 +; RV32-NEXT: # %bb.3: # %entry +; RV32-NEXT: addi a5, a3, 8 +; RV32-NEXT: bnez a0, .LBB7_6 +; RV32-NEXT: .LBB7_4: +; RV32-NEXT: addi t0, a2, 16 +; RV32-NEXT: j .LBB7_7 +; RV32-NEXT: .LBB7_5: +; RV32-NEXT: addi a5, a2, 8 +; RV32-NEXT: beqz a0, .LBB7_4 +; RV32-NEXT: .LBB7_6: # %entry +; RV32-NEXT: addi t0, a3, 16 +; RV32-NEXT: .LBB7_7: # %entry +; RV32-NEXT: lw a6, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: lw a7, 0(a5) +; RV32-NEXT: lw a5, 4(a5) +; RV32-NEXT: lw t1, 0(t0) +; RV32-NEXT: lw t0, 4(t0) +; RV32-NEXT: beqz a0, .LBB7_9 +; RV32-NEXT: # %bb.8: # %entry +; RV32-NEXT: addi a2, a3, 24 +; RV32-NEXT: j .LBB7_10 +; RV32-NEXT: .LBB7_9: +; RV32-NEXT: addi a2, a2, 24 +; RV32-NEXT: .LBB7_10: # %entry +; RV32-NEXT: lw a0, 0(a2) +; RV32-NEXT: lw a2, 4(a2) +; RV32-NEXT: sw a6, 0(a4) +; RV32-NEXT: sw a1, 4(a4) +; RV32-NEXT: sw a7, 8(a4) +; RV32-NEXT: sw a5, 12(a4) +; RV32-NEXT: sw t1, 16(a4) +; RV32-NEXT: sw t0, 20(a4) +; RV32-NEXT: sw a0, 24(a4) +; RV32-NEXT: sw a2, 28(a4) +; RV32-NEXT: ret +entry: + %.not.i = icmp eq i64 %cin, 0 + %0 = load i64, ptr %y, align 8 + %1 = load i64, ptr %x, align 8 + %or = select i1 %.not.i, i64 %1, i64 %0 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %2 = load i64, ptr %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx6, align 8 + %or9 = select i1 %.not.i, i64 %3, i64 %2 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %4 = load i64, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %5 = load i64, ptr %arrayidx12, align 8 + %or15 = select i1 %.not.i, i64 %5, i64 %4 + %arrayidx16 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %6 = load i64, ptr %arrayidx16, align 8 + %arrayidx18 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %7 = load i64, ptr %arrayidx18, align 8 + %or21 = select i1 %.not.i, i64 %7, i64 %6 + store i64 %or, ptr %r, align 8 + %arrayidx23 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %or9, ptr %arrayidx23, align 8 + %arrayidx24 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %or15, ptr %arrayidx24, align 8 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %or21, ptr %arrayidx25, align 8 + ret void +} + +define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { +; RV64-LABEL: cmovznz4_builtin_ctselect: +; RV64: # %bb.0: # %entry +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: ld a4, 0(a2) +; RV64-NEXT: ld a5, 0(a1) +; RV64-NEXT: neg a6, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: sd a4, 0(a3) +; RV64-NEXT: ld a4, 8(a2) +; RV64-NEXT: ld a5, 8(a1) +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: sd a4, 8(a3) +; RV64-NEXT: ld a4, 16(a2) +; RV64-NEXT: ld a5, 16(a1) +; RV64-NEXT: and a4, a6, a4 +; RV64-NEXT: and a5, a0, a5 +; RV64-NEXT: or a4, a5, a4 +; RV64-NEXT: sd a4, 16(a3) +; RV64-NEXT: ld a2, 24(a2) +; RV64-NEXT: ld a1, 24(a1) +; RV64-NEXT: and a2, a6, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: sd a0, 24(a3) +; RV64-NEXT: ret +; +; RV32-LABEL: cmovznz4_builtin_ctselect: +; RV32: # %bb.0: # %entry +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: lw a1, 0(a2) +; RV32-NEXT: lw a5, 4(a2) +; RV32-NEXT: lw a6, 0(a3) +; RV32-NEXT: 
lw a7, 4(a3) +; RV32-NEXT: snez t0, a0 +; RV32-NEXT: neg a0, t0 +; RV32-NEXT: addi t0, t0, -1 +; RV32-NEXT: and a6, a0, a6 +; RV32-NEXT: and a1, t0, a1 +; RV32-NEXT: and a7, a0, a7 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: or a1, a1, a6 +; RV32-NEXT: or a5, a5, a7 +; RV32-NEXT: sw a1, 0(a4) +; RV32-NEXT: sw a5, 4(a4) +; RV32-NEXT: lw a1, 8(a3) +; RV32-NEXT: lw a5, 8(a2) +; RV32-NEXT: lw a6, 12(a3) +; RV32-NEXT: lw a7, 12(a2) +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: and a6, a0, a6 +; RV32-NEXT: and a7, t0, a7 +; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: or a5, a7, a6 +; RV32-NEXT: sw a1, 8(a4) +; RV32-NEXT: sw a5, 12(a4) +; RV32-NEXT: lw a1, 16(a3) +; RV32-NEXT: lw a5, 16(a2) +; RV32-NEXT: lw a6, 20(a3) +; RV32-NEXT: lw a7, 20(a2) +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: and a6, a0, a6 +; RV32-NEXT: and a7, t0, a7 +; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: or a5, a7, a6 +; RV32-NEXT: sw a1, 16(a4) +; RV32-NEXT: sw a5, 20(a4) +; RV32-NEXT: lw a1, 24(a3) +; RV32-NEXT: lw a5, 24(a2) +; RV32-NEXT: lw a3, 28(a3) +; RV32-NEXT: lw a2, 28(a2) +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a5, t0, a5 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: and a2, t0, a2 +; RV32-NEXT: or a1, a5, a1 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: sw a1, 24(a4) +; RV32-NEXT: sw a0, 28(a4) +; RV32-NEXT: ret +entry: + %cmp = icmp eq i64 %cin, 0 + %0 = load i64, ptr %x, align 8 + %1 = load i64, ptr %y, align 8 + %2 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %0, i64 %1) + store i64 %2, ptr %r, align 8 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %4 = load i64, ptr %arrayidx5, align 8 + %5 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %3, i64 %4) + %arrayidx6 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %5, ptr %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %6 = load i64, ptr %arrayidx8, align 8 + %arrayidx9 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %7 = load i64, ptr %arrayidx9, align 8 + %8 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %6, i64 %7) + %arrayidx10 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %8, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %9 = load i64, ptr %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %10 = load i64, ptr %arrayidx13, align 8 + %11 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %9, i64 %10) + %arrayidx14 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %11, ptr %arrayidx14, align 8 + ret void +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..27c0d521bb631 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-patterns.ll @@ -0,0 +1,388 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test smin(x, 0) pattern +define i32 @test_ctselect_smin_zero(i32 %x) { +; RV64-LABEL: test_ctselect_smin_zero: +; RV64: # %bb.0: +; RV64-NEXT: sraiw a1, a0, 31 +; RV64-NEXT: 
and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smin_zero: +; RV32: # %bb.0: +; RV32-NEXT: srai a1, a0, 31 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; RV64-LABEL: test_ctselect_smax_zero: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a0 +; RV64-NEXT: sgtz a1, a1 +; RV64-NEXT: neg a1, a1 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smax_zero: +; RV32: # %bb.0: +; RV32-NEXT: sgtz a1, a0 +; RV32-NEXT: neg a1, a1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: ret + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_smin_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: sext.w a3, a0 +; RV64-NEXT: slt a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smin_generic: +; RV32: # %bb.0: +; RV32-NEXT: slt a2, a0, a1 +; RV32-NEXT: addi a3, a2, -1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_smax_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: sext.w a3, a1 +; RV64-NEXT: slt a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_smax_generic: +; RV32: # %bb.0: +; RV32-NEXT: slt a2, a1, a0 +; RV32-NEXT: addi a3, a2, -1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_umin_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: sext.w a3, a0 +; RV64-NEXT: sltu a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_umin_generic: +; RV32: # %bb.0: +; RV32-NEXT: sltu a2, a0, a1 +; RV32-NEXT: addi a3, a2, -1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; RV64-LABEL: test_ctselect_umax_generic: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a0 +; RV64-NEXT: sext.w a3, a1 +; RV64-NEXT: sltu a2, a3, a2 +; RV64-NEXT: addi a3, a2, -1 +; RV64-NEXT: neg a2, a2 +; RV64-NEXT: and a1, a3, a1 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_umax_generic: +; RV32: # %bb.0: +; 
RV32-NEXT: sltu a2, a1, a0 +; RV32-NEXT: addi a3, a2, -1 +; RV32-NEXT: neg a2, a2 +; RV32-NEXT: and a1, a3, a1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; RV64-LABEL: test_ctselect_abs: +; RV64: # %bb.0: +; RV64-NEXT: negw a1, a0 +; RV64-NEXT: sraiw a2, a0, 31 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_abs: +; RV32: # %bb.0: +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: srai a2, a0, 31 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; RV64-LABEL: test_ctselect_nabs: +; RV64: # %bb.0: +; RV64-NEXT: negw a1, a0 +; RV64-NEXT: sraiw a2, a0, 31 +; RV64-NEXT: and a0, a2, a0 +; RV64-NEXT: not a2, a2 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_nabs: +; RV32: # %bb.0: +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: srai a2, a0, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: not a2, a2 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; RV64-LABEL: test_ctselect_sign_extend: +; RV64: # %bb.0: +; RV64-NEXT: sraiw a0, a0, 31 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_sign_extend: +; RV32: # %bb.0: +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: ret + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; RV64-LABEL: test_ctselect_zero_extend: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_zero_extend: +; RV32: # %bb.0: +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: ret + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_constant_folding_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_constant_folding_true: +; RV32: # %bb.0: +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_constant_folding_false: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_constant_folding_false: +; RV32: # %bb.0: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; RV64-LABEL: test_ctselect_identical_operands: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: 
test_ctselect_identical_operands: +; RV32: # %bb.0: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_inverted_condition: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_inverted_condition: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; RV64-LABEL: test_ctselect_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a2, a2, 1 +; RV64-NEXT: addi a7, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a4, a7, a4 +; RV64-NEXT: neg a7, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: neg a3, a2 +; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a1, a1, a5 +; RV64-NEXT: or a0, a0, a4 +; RV64-NEXT: and a0, a7, a0 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: and a0, a3, a0 +; RV64-NEXT: and a1, a2, a6 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: addi a7, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a4, a7, a4 +; RV32-NEXT: neg a7, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: neg a3, a2 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a1, a1, a5 +; RV32-NEXT: or a0, a0, a4 +; RV32-NEXT: and a0, a7, a0 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: and a1, a2, a6 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; RV64-LABEL: test_ctselect_i64_smin_zero: +; RV64: # %bb.0: +; RV64-NEXT: srai a1, a0, 63 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i64_smin_zero: +; RV32: # %bb.0: +; RV32-NEXT: srai a2, a1, 31 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: ret + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll new file mode 100644 index 0000000000000..014d95c3883b9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback-vector-rvv.ll @@ -0,0 +1,804 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -O3 | FileCheck %s --check-prefix=RV32 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvl128b -O3 | FileCheck %s --check-prefix=RV32-V128 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvl256b -O3 | FileCheck %s --check-prefix=RV64-V256 + + +; Basic pass-through select on nxv4i32 +define @ctsel_nxv4i32_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv4i32_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4i32(i1 %cond, %a, %b) + ret %r +} + +; Select with loads (aligned) +define @ctsel_nxv4i32_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: ctsel_nxv4i32_load: +; RV64: # %bb.0: +; RV64-NEXT: vl2re32.v v8, (a1) +; RV64-NEXT: vl2re32.v v10, (a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_load: +; RV32: # %bb.0: +; RV32-NEXT: vl2re32.v v8, (a1) +; RV32-NEXT: vl2re32.v v10, (a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; 
RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_load: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vl2re32.v v8, (a1) +; RV32-V128-NEXT: vl2re32.v v10, (a2) +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_load: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vl2re32.v v8, (a1) +; RV64-V256-NEXT: vl2re32.v v10, (a2) +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %a = load , ptr %p1, align 16 + %b = load , ptr %p2, align 16 + %r = call @llvm.ct.select.nxv4i32(i1 %cond, %a, %b) + ret %r +} + +; Mixed: do arithmetic first, then select, then store +define void @ctsel_nxv4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) { +; RV64-LABEL: ctsel_nxv4i32_mixed: +; RV64: # %bb.0: +; RV64-NEXT: vl2re32.v v8, (a1) +; RV64-NEXT: vl2re32.v v10, (a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vadd.vv v10, v10, v10 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vs2r.v v8, (a3) +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_mixed: +; RV32: # %bb.0: +; RV32-NEXT: vl2re32.v v8, (a1) +; RV32-NEXT: vl2re32.v v10, (a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vs2r.v v8, (a3) +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_mixed: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vl2re32.v v8, (a1) +; RV32-V128-NEXT: vl2re32.v v10, (a2) +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vadd.vv v8, v8, v8 +; RV32-V128-NEXT: vadd.vv v10, v10, v10 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; 
RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: vs2r.v v8, (a3) +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_mixed: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vl2re32.v v8, (a1) +; RV64-V256-NEXT: vl2re32.v v10, (a2) +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vadd.vv v8, v8, v8 +; RV64-V256-NEXT: vadd.vv v10, v10, v10 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: vs2r.v v8, (a3) +; RV64-V256-NEXT: ret + %a = load , ptr %p1, align 16 + %b = load , ptr %p2, align 16 + ; avoid scalable vector constants: use %a+%a and %b+%b + %a2 = add %a, %a + %b2 = add %b, %b + %r = call @llvm.ct.select.nxv4i32(i1 %cond, %a2, %b2) + store %r, ptr %out, align 16 + ret void +} + +; Const-true/false fold smoke tests +define @ctsel_nxv4i32_true( %a, %b) { +; RV64-LABEL: ctsel_nxv4i32_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_true: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_true: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_true: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4i32(i1 true, %a, %b) + ret %r +} + +define @ctsel_nxv4i32_false( %a, %b) { +; RV64-LABEL: ctsel_nxv4i32_false: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_false: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_false: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-V128-NEXT: vmv2r.v v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_false: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-V256-NEXT: vmv2r.v v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4i32(i1 false, %a, %b) + ret %r +} + +; Chain two selects to ensure masks don’t get merged away +define @ctsel_nxv4i32_chain(i1 %c1, i1 %c2, +; RV64-LABEL: ctsel_nxv4i32_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v14, 0 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v16, a0 +; RV64-NEXT: vmsne.vi v0, v16, 0 +; RV64-NEXT: vmv.v.x v16, a1 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v18, v14, -1, v0 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vi v0, v16, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vim v14, v14, -1, v0 +; RV64-NEXT: vand.vv v8, v18, v8 +; RV64-NEXT: vnot.v v16, v18 +; RV64-NEXT: vand.vv v10, v16, v10 +; RV64-NEXT: vnot.v v16, v14 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vand.vv v8, v14, v8 +; RV64-NEXT: vand.vv v10, v16, v12 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4i32_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v14, 0 +; 
RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v16, a0 +; RV32-NEXT: vmsne.vi v0, v16, 0 +; RV32-NEXT: vmv.v.x v16, a1 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v18, v14, -1, v0 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vi v0, v16, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vim v14, v14, -1, v0 +; RV32-NEXT: vand.vv v8, v18, v8 +; RV32-NEXT: vnot.v v16, v18 +; RV32-NEXT: vand.vv v10, v16, v10 +; RV32-NEXT: vnot.v v16, v14 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v14, v8 +; RV32-NEXT: vand.vv v10, v16, v12 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4i32_chain: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v14, 0 +; RV32-V128-NEXT: andi a1, a1, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v16, a0 +; RV32-V128-NEXT: vmsne.vi v0, v16, 0 +; RV32-V128-NEXT: vmv.v.x v16, a1 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmerge.vim v18, v14, -1, v0 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmsne.vi v0, v16, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmerge.vim v14, v14, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v18, v8 +; RV32-V128-NEXT: vnot.v v16, v18 +; RV32-V128-NEXT: vand.vv v10, v16, v10 +; RV32-V128-NEXT: vnot.v v16, v14 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: vand.vv v8, v14, v8 +; RV32-V128-NEXT: vand.vv v10, v16, v12 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4i32_chain: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v14, 0 +; RV64-V256-NEXT: andi a1, a1, 1 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v16, a0 +; RV64-V256-NEXT: vmsne.vi v0, v16, 0 +; RV64-V256-NEXT: vmv.v.x v16, a1 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmerge.vim v18, v14, -1, v0 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmsne.vi v0, v16, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmerge.vim v14, v14, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v18, v8 +; RV64-V256-NEXT: vnot.v v16, v18 +; RV64-V256-NEXT: vand.vv v10, v16, v10 +; RV64-V256-NEXT: vnot.v v16, v14 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: vand.vv v8, v14, v8 +; RV64-V256-NEXT: vand.vv v10, v16, v12 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %a, + %b, + %c) { + %t = call @llvm.ct.select.nxv4i32(i1 %c1, %a, %b) + %r = call @llvm.ct.select.nxv4i32(i1 %c2, %t, %c) + ret %r +} + +; A different element width +define @ctsel_nxv8i16_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv8i16_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv8i16_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, 
m1, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv8i16_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv8i16_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv8i16(i1 %cond, %a, %b) + ret %r +} + +define @ctsel_nxv16i8_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv16i8_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv16i8_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv16i8_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv16i8_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv16i8(i1 %cond, %a, %b) + ret %r +} + +; 64-bit elements (useful on RV64) +define @ctsel_nxv2i64_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv2i64_basic: +; RV64: # %bb.0: +; 
RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv2i64_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv2i64_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv2i64_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv2i64(i1 %cond, %a, %b) + ret %r +} + +; Floating-point scalable vectors (bitcasted in your fallback) +define @ctsel_nxv4f32_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv4f32_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4f32_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4f32_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, 
v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4f32_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call @llvm.ct.select.nxv4f32(i1 %cond, %a, %b) + ret %r +} + +; FP arithmetic around select +define @ctsel_nxv4f32_arith(i1 %cond, %x, %y) { +; RV64-LABEL: ctsel_nxv4f32_arith: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-NEXT: vfadd.vv v12, v8, v10 +; RV64-NEXT: vfsub.vv v8, v8, v10 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vmerge.vim v10, v10, -1, v0 +; RV64-NEXT: vand.vv v12, v10, v12 +; RV64-NEXT: vnot.v v10, v10 +; RV64-NEXT: vand.vv v8, v10, v8 +; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv4f32_arith: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vfadd.vv v12, v8, v10 +; RV32-NEXT: vfsub.vv v8, v8, v10 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vmsne.vi v0, v10, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmerge.vim v10, v10, -1, v0 +; RV32-NEXT: vand.vv v12, v10, v12 +; RV32-NEXT: vnot.v v10, v10 +; RV32-NEXT: vand.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv4f32_arith: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vfadd.vv v12, v8, v10 +; RV32-V128-NEXT: vfsub.vv v8, v8, v10 +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32-V128-NEXT: vmv.v.x v10, a0 +; RV32-V128-NEXT: vmsne.vi v0, v10, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v10, 0 +; RV32-V128-NEXT: vmerge.vim v10, v10, -1, v0 +; RV32-V128-NEXT: vand.vv v12, v10, v12 +; RV32-V128-NEXT: vnot.v v10, v10 +; RV32-V128-NEXT: vand.vv v8, v10, v8 +; RV32-V128-NEXT: vor.vv v8, v12, v8 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv4f32_arith: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vfadd.vv v12, v8, v10 +; RV64-V256-NEXT: vfsub.vv v8, v8, v10 +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV64-V256-NEXT: vmv.v.x v10, a0 +; RV64-V256-NEXT: vmsne.vi v0, v10, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v10, 0 +; RV64-V256-NEXT: vmerge.vim v10, v10, -1, v0 +; RV64-V256-NEXT: vand.vv v12, v10, v12 +; RV64-V256-NEXT: vnot.v v10, v10 +; RV64-V256-NEXT: vand.vv v8, v10, v8 +; RV64-V256-NEXT: vor.vv v8, v12, v8 +; RV64-V256-NEXT: ret + %sum = fadd %x, %y + %diff = fsub %x, %y + %r = call @llvm.ct.select.nxv4f32(i1 %cond, %sum, %diff) + ret %r +} + +define @ctsel_nxv2f64_basic(i1 %cond, %a, %b) { +; RV64-LABEL: ctsel_nxv2f64_basic: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: vsetvli 
a1, zero, e8, mf4, ta, ma +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vmv.v.i v12, 0 +; RV64-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-NEXT: vand.vv v8, v12, v8 +; RV64-NEXT: vnot.v v12, v12 +; RV64-NEXT: vand.vv v10, v12, v10 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret +; +; RV32-LABEL: ctsel_nxv2f64_basic: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vmv.v.i v12, 0 +; RV32-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-NEXT: vand.vv v8, v12, v8 +; RV32-NEXT: vnot.v v12, v12 +; RV32-NEXT: vand.vv v10, v12, v10 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV32-V128-LABEL: ctsel_nxv2f64_basic: +; RV32-V128: # %bb.0: +; RV32-V128-NEXT: andi a0, a0, 1 +; RV32-V128-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32-V128-NEXT: vmv.v.x v12, a0 +; RV32-V128-NEXT: vmsne.vi v0, v12, 0 +; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-V128-NEXT: vmv.v.i v12, 0 +; RV32-V128-NEXT: vmerge.vim v12, v12, -1, v0 +; RV32-V128-NEXT: vand.vv v8, v12, v8 +; RV32-V128-NEXT: vnot.v v12, v12 +; RV32-V128-NEXT: vand.vv v10, v12, v10 +; RV32-V128-NEXT: vor.vv v8, v8, v10 +; RV32-V128-NEXT: ret +; +; RV64-V256-LABEL: ctsel_nxv2f64_basic: +; RV64-V256: # %bb.0: +; RV64-V256-NEXT: andi a0, a0, 1 +; RV64-V256-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64-V256-NEXT: vmv.v.x v12, a0 +; RV64-V256-NEXT: vmsne.vi v0, v12, 0 +; RV64-V256-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-V256-NEXT: vmv.v.i v12, 0 +; RV64-V256-NEXT: vmerge.vim v12, v12, -1, v0 +; RV64-V256-NEXT: vand.vv v8, v12, v8 +; RV64-V256-NEXT: vnot.v v12, v12 +; RV64-V256-NEXT: vand.vv v10, v12, v10 +; RV64-V256-NEXT: vor.vv v8, v8, v10 +; RV64-V256-NEXT: ret + %r = call <vscale x 2 x double> @llvm.ct.select.nxv2f64(i1 %cond, <vscale x 2 x double> %a, <vscale x 2 x double> %b) + ret <vscale x 2 x double> %r +} + +declare <vscale x 4 x i32> @llvm.ct.select.nxv4i32(i1, <vscale x 4 x i32>, <vscale x 4 x i32>) +declare <vscale x 8 x i16> @llvm.ct.select.nxv8i16(i1, <vscale x 8 x i16>, <vscale x 8 x i16>) +declare <vscale x 16 x i8> @llvm.ct.select.nxv16i8(i1, <vscale x 16 x i8>, <vscale x 16 x i8>) +declare <vscale x 2 x i64> @llvm.ct.select.nxv2i64(i1, <vscale x 2 x i64>, <vscale x 2 x i64>) +declare <vscale x 4 x float> @llvm.ct.select.nxv4f32(i1, <vscale x 4 x float>, <vscale x 4 x float>) +declare <vscale x 2 x double> @llvm.ct.select.nxv2f64(i1, <vscale x 2 x double>, <vscale x 2 x double>) diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll new file mode 100644 index 0000000000000..1625c8db2d85c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll @@ -0,0 +1,600 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32 + +; Test basic ct.select functionality for scalar types +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; RV64-LABEL: test_ctselect_i8: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i8: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; RV64-LABEL: test_ctselect_i16: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi
a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i16: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_i32: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i32: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; RV64-LABEL: test_ctselect_i64: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_i64: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: or a0, a1, a3 +; RV32-NEXT: or a1, a2, a4 +; RV32-NEXT: ret + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; RV64-LABEL: test_ctselect_ptr: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_ptr: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_const_true: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_const_true: +; RV32: # %bb.0: +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_const_false: +; RV64: # %bb.0: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_const_false: +; RV32: # %bb.0: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_eq: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 
+; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_eq: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_ne: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: neg a1, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_ne: +; RV32: # %bb.0: +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: neg a1, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_slt: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: slt a0, a0, a1 +; RV64-NEXT: addi a1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_slt: +; RV32: # %bb.0: +; RV32-NEXT: slt a0, a0, a1 +; RV32-NEXT: addi a1, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; RV64-LABEL: test_ctselect_icmp_ult: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a1, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a1, a1, a3 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_icmp_ult: +; RV32: # %bb.0: +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a1, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a1, a1, a3 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_load: +; RV64: # %bb.0: +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: lw a2, 0(a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested ctselect calls +define i32 
@test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; RV64-LABEL: test_ctselect_nested: +; RV64: # %bb.0: +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a5, a1, -1 +; RV64-NEXT: neg a1, a1 +; RV64-NEXT: and a3, a5, a3 +; RV64-NEXT: neg a5, a0 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a1, a1, a2 +; RV64-NEXT: or a1, a1, a3 +; RV64-NEXT: and a1, a5, a1 +; RV64-NEXT: and a0, a0, a4 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_nested: +; RV32: # %bb.0: +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a1, -1 +; RV32-NEXT: neg a1, a1 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: neg a5, a0 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: or a1, a1, a3 +; RV32-NEXT: and a1, a5, a1 +; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Test float (32-bit) +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; RV64-LABEL: test_ctselect_f32: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test double (64-bit) +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; RV64-LABEL: test_ctselect_f64: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f64: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: or a0, a1, a3 +; RV32-NEXT: or a1, a2, a4 +; RV32-NEXT: ret + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + + +; Test chained float selects +define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, float %c) { +; RV64-LABEL: test_ctselect_f32_chain: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: addi a5, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a3, a5, a3 +; RV64-NEXT: neg a5, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a0, a3 +; RV64-NEXT: and a0, a5, a0 +; RV64-NEXT: and a1, a1, a4 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_chain: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a3, a5, a3 +; RV32-NEXT: neg a5, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: or a0, a0, a3 +; RV32-NEXT: and a0, a5, a0 +; RV32-NEXT: and a1, a1, a4 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: ret + %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %result = call float 
@llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) + ret float %result +} + +; Test with float load +define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_f32_load: +; RV64: # %bb.0: +; RV64-NEXT: lw a1, 0(a1) +; RV64-NEXT: lw a2, 0(a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %a = load float, ptr %p1 + %b = load float, ptr %p2 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with double load +define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { +; RV64-LABEL: test_ctselect_f64_load: +; RV64: # %bb.0: +; RV64-NEXT: ld a1, 0(a1) +; RV64-NEXT: ld a2, 0(a2) +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f64_load: +; RV32: # %bb.0: +; RV32-NEXT: lw a3, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: lw a4, 0(a2) +; RV32-NEXT: lw a2, 4(a2) +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a5, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: and a3, a0, a3 +; RV32-NEXT: and a2, a5, a2 +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: or a0, a3, a4 +; RV32-NEXT: or a1, a1, a2 +; RV32-NEXT: ret + %a = load double, ptr %p1 + %b = load double, ptr %p2 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test mixed with arithmetic +define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { +; RV64-LABEL: test_ctselect_f32_arithmetic: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -48 +; RV64-NEXT: .cfi_def_cfa_offset 48 +; RV64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s1, -24 +; RV64-NEXT: .cfi_offset s2, -32 +; RV64-NEXT: .cfi_offset s3, -40 +; RV64-NEXT: mv s0, a2 +; RV64-NEXT: mv s1, a1 +; RV64-NEXT: mv s2, a0 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: call __addsf3 +; RV64-NEXT: mv s3, a0 +; RV64-NEXT: mv a0, s1 +; RV64-NEXT: mv a1, s0 +; RV64-NEXT: call __subsf3 +; RV64-NEXT: andi a1, s2, 1 +; RV64-NEXT: neg a2, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a2, a2, s3 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: .cfi_restore s1 +; RV64-NEXT: .cfi_restore s2 +; RV64-NEXT: .cfi_restore s3 +; RV64-NEXT: addi sp, sp, 48 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_ctselect_f32_arithmetic: +; RV32: # %bb.0: +; 
RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: .cfi_offset s1, -12 +; RV32-NEXT: .cfi_offset s2, -16 +; RV32-NEXT: .cfi_offset s3, -20 +; RV32-NEXT: mv s0, a2 +; RV32-NEXT: mv s1, a1 +; RV32-NEXT: mv s2, a0 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: call __addsf3 +; RV32-NEXT: mv s3, a0 +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: mv a1, s0 +; RV32-NEXT: call __subsf3 +; RV32-NEXT: andi a1, s2, 1 +; RV32-NEXT: neg a2, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a2, a2, s3 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: .cfi_restore s1 +; RV32-NEXT: .cfi_restore s2 +; RV32-NEXT: .cfi_restore s3 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret + %sum = fadd float %x, %y + %diff = fsub float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %sum, float %diff) + ret float %result +} + +; Declare the intrinsics +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll new file mode 100644 index 0000000000000..60f6350d6508d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/ctselect-side-effects.ll @@ -0,0 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -O3 -filetype=asm | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -O3 -filetype=asm | FileCheck %s --check-prefix=RV32 + +; Test 1: Basic optimizations should still work +define i32 @test_basic_opts(i32 %x) { +; RV64-LABEL: test_basic_opts: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_basic_opts: +; RV32: # %bb.0: +; RV32-NEXT: ret + %a = or i32 %x, 0 ; Should eliminate + %b = and i32 %a, -1 ; Should eliminate + %c = xor i32 %b, 0 ; Should eliminate + ret i32 %c +} + +; Test 2: Constant folding should work +define i32 @test_constant_fold() { +; RV64-LABEL: test_constant_fold: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_constant_fold: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %a = xor i32 -1, -1 ; Should fold to 0 + ret i32 %a +} + +; Test 3: Protected pattern should NOT have branches +define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_protected_no_branch: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: addi a3, a0, -1 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: 
test_protected_no_branch: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: addi a3, a0, -1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: ret + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test 4: Explicit branch should still generate branches +define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_explicit_branch: +; RV64: # %bb.0: +; RV64-NEXT: andi a0, a0, 1 +; RV64-NEXT: beqz a0, .LBB3_2 +; RV64-NEXT: # %bb.1: # %true +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: ret +; RV64-NEXT: .LBB3_2: # %false +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV32-LABEL: test_explicit_branch: +; RV32: # %bb.0: +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: beqz a0, .LBB3_2 +; RV32-NEXT: # %bb.1: # %true +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: ret +; RV32-NEXT: .LBB3_2: # %false +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret + br i1 %cond, label %true, label %false +true: + ret i32 %a +false: + ret i32 %b +} + +; Test 5: Regular select (not ct.select) - whatever wasm wants to do +define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) { +; RV64-LABEL: test_regular_select: +; RV64: # %bb.0: +; RV64-NEXT: andi a3, a0, 1 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: bnez a3, .LBB4_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: .LBB4_2: +; RV64-NEXT: ret +; +; RV32-LABEL: test_regular_select: +; RV32: # %bb.0: +; RV32-NEXT: andi a3, a0, 1 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: bnez a3, .LBB4_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: .LBB4_2: +; RV32-NEXT: ret + %result = select i1 %cond, i32 %a, i32 %b + ret i32 %result +} + +; Test if XOR with all-ones still gets optimized +define i32 @test_xor_all_ones() { +; RV64-LABEL: test_xor_all_ones: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_xor_all_ones: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %xor1 = xor i32 -1, -1 ; Should optimize to 0 + ret i32 %xor1 +} + +define i32 @test_xor_same_value(i32 %x) { +; RV64-LABEL: test_xor_same_value: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_xor_same_value: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %xor2 = xor i32 %x, %x ; Should optimize to 0 + ret i32 %xor2 +} + +define i32 @test_normal_ops(i32 %x) { +; RV64-LABEL: test_normal_ops: +; RV64: # %bb.0: +; RV64-NEXT: ret +; +; RV32-LABEL: test_normal_ops: +; RV32: # %bb.0: +; RV32-NEXT: ret + %or1 = or i32 %x, 0 ; Should optimize to %x + %and1 = and i32 %or1, -1 ; Should optimize to %x + %xor1 = xor i32 %and1, 0 ; Should optimize to %x + ret i32 %xor1 +} + +; This simulates what the reviewer is worried about +define i32 @test_xor_with_const_operands() { +; RV64-LABEL: test_xor_with_const_operands: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; +; RV32-LABEL: test_xor_with_const_operands: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret + %a = xor i32 -1, -1 ; -1 ^ -1 should become 0 + %b = xor i32 0, 0 ; 0 ^ 0 should become 0 + %c = xor i32 42, 42 ; 42 ^ 42 should become 0 + %result = or i32 %a, %b + %final = or i32 %result, %c + ret i32 %final ; Should optimize to 0 +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll new file mode 100644 index 0000000000000..19f01b37ba8cb --- /dev/null +++ 
b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-edge-cases.ll @@ -0,0 +1,663 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; W32-LABEL: test_ctselect_i1: +; W32: .functype test_ctselect_i1 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i1: +; W64: .functype test_ctselect_i1 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; W32-LABEL: test_ctselect_extremal_values: +; W32: .functype test_ctselect_extremal_values (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 2147483647 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: i32.const -2147483648 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_extremal_values: +; W64: .functype test_ctselect_extremal_values (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 2147483647 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: i32.const -2147483648 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; W32-LABEL: test_ctselect_null_ptr: +; W32: .functype test_ctselect_null_ptr (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_null_ptr: +; W64: .functype test_ctselect_null_ptr (i32, i64) -> (i64) +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; W32-LABEL: test_ctselect_function_ptr: +; W32: .functype test_ctselect_function_ptr (i32, i32, i32) -> (i32) 
+; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_function_ptr: +; W64: .functype test_ctselect_function_ptr (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; W32-LABEL: test_ctselect_ptr_cmp: +; W32: .functype test_ctselect_ptr_cmp (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.eq +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_ptr_cmp: +; W64: .functype test_ctselect_ptr_cmp (i64, i64, i64, i64) -> (i64) +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.eq +; W64-NEXT: i64.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; W32-LABEL: test_ctselect_struct_ptr: +; W32: .functype test_ctselect_struct_ptr (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_struct_ptr: +; W64: .functype test_ctselect_struct_ptr (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr 
%result +} + +; Test with deeply nested conditions +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; W32-LABEL: test_ctselect_deeply_nested: +; W32: .functype test_ctselect_deeply_nested (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 3 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 3 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 2 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 4 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 5 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 6 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 7 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 3 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 8 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_deeply_nested: +; W64: .functype test_ctselect_deeply_nested (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 3 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 2 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 4 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 5 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 6 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 7 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 3 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 8 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; This test demonstrates the FStar cmovznz4 pattern using ct.select +; Based on https://godbolt.org/z/6Kb71Ks7z +; Shows that NoMerge flag prevents DAG optimization from introducing branches +define void @cmovznz4_fstar_original(i64 %cin, ptr 
%x, ptr %y, ptr %r) { +; W32-LABEL: cmovznz4_fstar_original: +; W32: .functype cmovznz4_fstar_original (i64, i32, i32, i32) -> () +; W32-NEXT: .local i32, i64, i64 +; W32-NEXT: # %bb.0: # %entry +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.eqz +; W32-NEXT: local.tee 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: local.set 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 8 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 8 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: local.set 5 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 16 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 16 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: local.set 6 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 24 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 24 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.select +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.store 24 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 6 +; W32-NEXT: i64.store 16 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 5 +; W32-NEXT: i64.store 8 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.store 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: cmovznz4_fstar_original: +; W64: .functype cmovznz4_fstar_original (i64, i64, i64, i64) -> () +; W64-NEXT: .local i32, i64, i64 +; W64-NEXT: # %bb.0: # %entry +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.eqz +; W64-NEXT: local.tee 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: local.set 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const 8 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.const 8 +; W64-NEXT: i64.add +; W64-NEXT: local.get 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: local.set 5 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const 16 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.const 16 +; W64-NEXT: i64.add +; W64-NEXT: local.get 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: local.set 6 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.const 24 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.const 24 +; W64-NEXT: i64.add +; W64-NEXT: local.get 4 +; W64-NEXT: i64.select +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.store 24 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 6 +; W64-NEXT: i64.store 16 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 5 +; W64-NEXT: i64.store 8 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.store 0 +; W64-NEXT: # fallthrough-return +entry: + %.not.i = icmp eq i64 %cin, 0 + %0 = load i64, ptr %y, align 8 + %1 = load i64, ptr %x, align 8 + %or = select i1 %.not.i, i64 %1, i64 %0 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %2 = load i64, ptr %arrayidx4, align 8 + %arrayidx6 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx6, align 8 + %or9 = select i1 %.not.i, i64 %3, i64 %2 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %4 = load i64, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %5 = load i64, ptr %arrayidx12, align 8 + %or15 = select i1 %.not.i, i64 %5, i64 %4 + %arrayidx16 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %6 = load i64, ptr %arrayidx16, align 8 + %arrayidx18 = 
getelementptr inbounds nuw i8, ptr %x, i64 24 + %7 = load i64, ptr %arrayidx18, align 8 + %or21 = select i1 %.not.i, i64 %7, i64 %6 + store i64 %or, ptr %r, align 8 + %arrayidx23 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %or9, ptr %arrayidx23, align 8 + %arrayidx24 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %or15, ptr %arrayidx24, align 8 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %or21, ptr %arrayidx25, align 8 + ret void +} + +define void @cmovznz4_builtin_ctselect(i64 %cin, ptr %x, ptr %y, ptr %r) { +; W32-LABEL: cmovznz4_builtin_ctselect: +; W32: .functype cmovznz4_builtin_ctselect (i64, i32, i32, i32) -> () +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: # %entry +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.eqz +; W32-NEXT: i64.select +; W32-NEXT: local.tee 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: local.get 0 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.xor +; W32-NEXT: local.tee 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 0 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 8 +; W32-NEXT: i64.and +; W32-NEXT: local.get 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 8 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 8 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 16 +; W32-NEXT: i64.and +; W32-NEXT: local.get 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 16 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 16 +; W32-NEXT: local.get 3 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 24 +; W32-NEXT: i64.and +; W32-NEXT: local.get 4 +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 24 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: i64.store 24 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: cmovznz4_builtin_ctselect: +; W64: .functype cmovznz4_builtin_ctselect (i64, i64, i64, i64) -> () +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: # %entry +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.eqz +; W64-NEXT: i64.select +; W64-NEXT: local.tee 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: local.get 0 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.xor +; W64-NEXT: local.tee 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 0 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 8 +; W64-NEXT: i64.and +; W64-NEXT: local.get 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 8 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 8 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 16 +; W64-NEXT: i64.and +; W64-NEXT: local.get 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 16 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 16 +; W64-NEXT: local.get 3 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 24 +; W64-NEXT: i64.and +; W64-NEXT: local.get 4 +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 24 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: i64.store 24 +; W64-NEXT: # fallthrough-return +entry: + %cmp = icmp eq i64 %cin, 0 + %0 = load i64, ptr %x, align 8 + %1 = load i64, ptr %y, 
align 8 + %2 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %0, i64 %1) + store i64 %2, ptr %r, align 8 + %arrayidx4 = getelementptr inbounds nuw i8, ptr %x, i64 8 + %3 = load i64, ptr %arrayidx4, align 8 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %y, i64 8 + %4 = load i64, ptr %arrayidx5, align 8 + %5 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %3, i64 %4) + %arrayidx6 = getelementptr inbounds nuw i8, ptr %r, i64 8 + store i64 %5, ptr %arrayidx6, align 8 + %arrayidx8 = getelementptr inbounds nuw i8, ptr %x, i64 16 + %6 = load i64, ptr %arrayidx8, align 8 + %arrayidx9 = getelementptr inbounds nuw i8, ptr %y, i64 16 + %7 = load i64, ptr %arrayidx9, align 8 + %8 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %6, i64 %7) + %arrayidx10 = getelementptr inbounds nuw i8, ptr %r, i64 16 + store i64 %8, ptr %arrayidx10, align 8 + %arrayidx12 = getelementptr inbounds nuw i8, ptr %x, i64 24 + %9 = load i64, ptr %arrayidx12, align 8 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %y, i64 24 + %10 = load i64, ptr %arrayidx13, align 8 + %11 = tail call i64 @llvm.ct.select.i64(i1 %cmp, i64 %9, i64 %10) + %arrayidx14 = getelementptr inbounds nuw i8, ptr %r, i64 24 + store i64 %11, ptr %arrayidx14, align 8 + ret void +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll new file mode 100644 index 0000000000000..5c8d66249a95a --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-patterns.ll @@ -0,0 +1,611 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test smin(x, 0) pattern +define i32 @test_ctselect_smin_zero(i32 %x) { +; W32-LABEL: test_ctselect_smin_zero: +; W32: .functype test_ctselect_smin_zero (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smin_zero: +; W64: .functype test_ctselect_smin_zero (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern +define i32 @test_ctselect_smax_zero(i32 %x) { +; W32-LABEL: test_ctselect_smax_zero: +; W32: .functype test_ctselect_smax_zero (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.gt_s +; W32-NEXT: i32.select +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smax_zero: +; W64: .functype test_ctselect_smax_zero (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.gt_s +; W64-NEXT: i32.select +; W64-NEXT: # fallthrough-return + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern 
+define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_smin_generic: +; W32: .functype test_ctselect_smin_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.select +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smin_generic: +; W64: .functype test_ctselect_smin_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.select +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_smax_generic: +; W32: .functype test_ctselect_smax_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.gt_s +; W32-NEXT: i32.select +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_smax_generic: +; W64: .functype test_ctselect_smax_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.gt_s +; W64-NEXT: i32.select +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_umin_generic: +; W32: .functype test_ctselect_umin_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_u +; W32-NEXT: i32.select +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_umin_generic: +; W64: .functype test_ctselect_umin_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_u +; W64-NEXT: i32.select +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and 
+; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; W32-LABEL: test_ctselect_umax_generic: +; W32: .functype test_ctselect_umax_generic (i32, i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.gt_u +; W32-NEXT: i32.select +; W32-NEXT: local.tee 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_umax_generic: +; W64: .functype test_ctselect_umax_generic (i32, i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.gt_u +; W64-NEXT: i32.select +; W64-NEXT: local.tee 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; W32-LABEL: test_ctselect_abs: +; W32: .functype test_ctselect_abs (i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_abs: +; W64: .functype test_ctselect_abs (i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; W32-LABEL: test_ctselect_nabs: +; W32: .functype test_ctselect_nabs (i32) -> (i32) +; W32-NEXT: .local i32 +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_nabs: +; W64: .functype test_ctselect_nabs (i32) -> (i32) +; W64-NEXT: .local i32 +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s +; 
W64-NEXT: local.tee 1 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; W32-LABEL: test_ctselect_sign_extend: +; W32: .functype test_ctselect_sign_extend (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 31 +; W32-NEXT: i32.shr_s +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_sign_extend: +; W64: .functype test_ctselect_sign_extend (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 31 +; W64-NEXT: i32.shr_s +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; W32-LABEL: test_ctselect_zero_extend: +; W32: .functype test_ctselect_zero_extend (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 0 +; W32-NEXT: i32.ne +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_zero_extend: +; W64: .functype test_ctselect_zero_extend (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 0 +; W64-NEXT: i32.ne +; W64-NEXT: # fallthrough-return + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_constant_folding_true: +; W32: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_constant_folding_true: +; W64: .functype test_ctselect_constant_folding_true (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_constant_folding_false: +; W32: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_constant_folding_false: +; W64: .functype test_ctselect_constant_folding_false (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; W32-LABEL: test_ctselect_identical_operands: +; W32: .functype test_ctselect_identical_operands (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_identical_operands: +; W64: .functype test_ctselect_identical_operands (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition 
+define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_inverted_condition: +; W32: .functype test_ctselect_inverted_condition (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.ne +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_inverted_condition: +; W64: .functype test_ctselect_inverted_condition (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.ne +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; W32-LABEL: test_ctselect_chain: +; W32: .functype test_ctselect_chain (i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 2 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 5 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 2 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 6 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_chain: +; W64: .functype test_ctselect_chain (i32, i32, i32, i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 2 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 4 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 5 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 2 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; 
W64-NEXT: local.get 6 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Test for 64-bit operations (supported on all 64-bit architectures) +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; W32-LABEL: test_ctselect_i64_smin_zero: +; W32: .functype test_ctselect_i64_smin_zero (i64) -> (i64) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: i64.const 63 +; W32-NEXT: i64.shr_s +; W32-NEXT: local.get 0 +; W32-NEXT: i64.and +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i64_smin_zero: +; W64: .functype test_ctselect_i64_smin_zero (i64) -> (i64) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: i64.const 63 +; W64-NEXT: i64.shr_s +; W64-NEXT: local.get 0 +; W64-NEXT: i64.and +; W64-NEXT: # fallthrough-return + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll new file mode 100644 index 0000000000000..daa7370fb481a --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback-vector.ll @@ -0,0 +1,566 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -mattr=+simd128 | FileCheck %s --check-prefix=WASM32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -mattr=+simd128 | FileCheck %s --check-prefix=WASM64 + +; Test 32-bit integer vector (4 x i32 = 128-bit) +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; WASM32-LABEL: test_ctselect_v4i32: +; WASM32: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32: +; WASM64: .functype test_ctselect_v4i32 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test 16-bit integer vector (8 x i16 = 128-bit) +define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) { +; WASM32-LABEL: test_ctselect_v8i16: +; WASM32: .functype test_ctselect_v8i16 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i16x8.splat +; WASM32-NEXT: i32.const 15 +; WASM32-NEXT: i16x8.shl +; WASM32-NEXT: i32.const 15 +; WASM32-NEXT: i16x8.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v8i16: +; WASM64: .functype 
test_ctselect_v8i16 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i16x8.splat +; WASM64-NEXT: i32.const 15 +; WASM64-NEXT: i16x8.shl +; WASM64-NEXT: i32.const 15 +; WASM64-NEXT: i16x8.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %result +} + +; Test byte vector (16 x i8 = 128-bit) +define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) { +; WASM32-LABEL: test_ctselect_v16i8: +; WASM32: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i8x16.splat +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shl +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v16i8: +; WASM64: .functype test_ctselect_v16i8 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i8x16.splat +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shl +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %result +} + +; Test 64-bit integer vector (2 x i64 = 128-bit) +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; WASM32-LABEL: test_ctselect_v2i64: +; WASM32: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shl +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v2i64: +; WASM64: .functype test_ctselect_v2i64 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shl +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +; Test single-precision float vector (4 x float = 128-bit) +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; WASM32-LABEL: test_ctselect_v4f32: +; WASM32: .functype test_ctselect_v4f32 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4f32: +; WASM64: .functype test_ctselect_v4f32 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: 
i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +; Test double-precision float vector (2 x double = 128-bit) +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; WASM32-LABEL: test_ctselect_v2f64: +; WASM32: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shl +; WASM32-NEXT: i32.const 63 +; WASM32-NEXT: i64x2.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v2f64: +; WASM64: .functype test_ctselect_v2f64 (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shl +; WASM64-NEXT: i32.const 63 +; WASM64-NEXT: i64x2.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; Test with aligned loads (common case) +define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) { +; WASM32-LABEL: test_ctselect_v4i32_aligned_load: +; WASM32: .functype test_ctselect_v4i32_aligned_load (i32, i32, i32) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_aligned_load: +; WASM64: .functype test_ctselect_v4i32_aligned_load (i32, i64, i64) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.load 0 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %a = load <4 x i32>, ptr %p1, align 16 + %b = load <4 x i32>, ptr %p2, align 16 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with unaligned loads (stress test) +define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; WASM32-LABEL: test_ctselect_v4i32_unaligned_load: +; WASM32: .functype test_ctselect_v4i32_unaligned_load (i32, i32, i32) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.load 0:p2align=2 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.load 0:p2align=2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_unaligned_load: +; WASM64: .functype test_ctselect_v4i32_unaligned_load (i32, i64, i64) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: 
v128.load 0:p2align=2 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.load 0:p2align=2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %a = load <4 x i32>, ptr %p1, align 4 + %b = load <4 x i32>, ptr %p2, align 4 + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with stores to verify result handling +define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) { +; WASM32-LABEL: test_ctselect_v4i32_store: +; WASM32: .functype test_ctselect_v4i32_store (i32, v128, v128, i32) -> () +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: v128.store 0 +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_store: +; WASM64: .functype test_ctselect_v4i32_store (i32, v128, v128, i64) -> () +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: v128.store 0 +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + store <4 x i32> %result, ptr %out, align 16 + ret void +} + +; Test chained selects (multiple conditions) +define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; WASM32-LABEL: test_ctselect_v4i32_chain: +; WASM32: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 3 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.get 4 +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_chain: +; WASM64: .functype test_ctselect_v4i32_chain (i32, i32, v128, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 3 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.get 4 +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %tmp = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond1, <4 x i32> %a, <4 x i32> %b) + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond2, <4 x i32> %tmp, <4 x i32> %c) + ret <4 x i32> %result +} + +; Test with arithmetic operations (ensure 
float vectors work with FP ops) +define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4 x float> %y) { +; WASM32-LABEL: test_ctselect_v4f32_arithmetic: +; WASM32: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: f32x4.add +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: f32x4.sub +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4f32_arithmetic: +; WASM64: .functype test_ctselect_v4f32_arithmetic (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: f32x4.add +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: f32x4.sub +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %sum = fadd <4 x float> %x, %y + %diff = fsub <4 x float> %x, %y + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %sum, <4 x float> %diff) + ret <4 x float> %result +} + +; Test with zero vectors +define <4 x i32> @test_ctselect_v4i32_zeros(i1 %cond, <4 x i32> %a) { +; WASM32-LABEL: test_ctselect_v4i32_zeros: +; WASM32: .functype test_ctselect_v4i32_zeros (i32, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: v128.and +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_zeros: +; WASM64: .functype test_ctselect_v4i32_zeros (i32, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: v128.and +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, + <4 x i32> %a, + <4 x i32> zeroinitializer) + ret <4 x i32> %result +} + +; Test with function arguments directly (no loads) +define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind { +; WASM32-LABEL: test_ctselect_v4i32_args: +; WASM32: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_args: +; WASM64: .functype test_ctselect_v4i32_args (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x 
i32> %b) + ret <4 x i32> %result +} + +; Test with multiple uses of result +define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; WASM32-LABEL: test_ctselect_v4i32_multi_use: +; WASM32: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i32x4.splat +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shl +; WASM32-NEXT: i32.const 31 +; WASM32-NEXT: i32x4.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: local.tee 2 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: i32x4.add +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v4i32_multi_use: +; WASM64: .functype test_ctselect_v4i32_multi_use (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i32x4.splat +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shl +; WASM64-NEXT: i32.const 31 +; WASM64-NEXT: i32x4.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: local.tee 2 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: i32x4.add +; WASM64-NEXT: # fallthrough-return + %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + %add = add <4 x i32> %sel, %sel ; Use result twice + ret <4 x i32> %add +} + +; Test byte vector with operations +define <16 x i8> @test_ctselect_v16i8_ops(i1 %cond, <16 x i8> %x, <16 x i8> %y) { +; WASM32-LABEL: test_ctselect_v16i8_ops: +; WASM32: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) +; WASM32-NEXT: # %bb.0: +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.xor +; WASM32-NEXT: local.get 1 +; WASM32-NEXT: local.get 2 +; WASM32-NEXT: v128.and +; WASM32-NEXT: local.get 0 +; WASM32-NEXT: i8x16.splat +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shl +; WASM32-NEXT: i32.const 7 +; WASM32-NEXT: i8x16.shr_s +; WASM32-NEXT: v128.bitselect +; WASM32-NEXT: # fallthrough-return +; +; WASM64-LABEL: test_ctselect_v16i8_ops: +; WASM64: .functype test_ctselect_v16i8_ops (i32, v128, v128) -> (v128) +; WASM64-NEXT: # %bb.0: +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.xor +; WASM64-NEXT: local.get 1 +; WASM64-NEXT: local.get 2 +; WASM64-NEXT: v128.and +; WASM64-NEXT: local.get 0 +; WASM64-NEXT: i8x16.splat +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shl +; WASM64-NEXT: i32.const 7 +; WASM64-NEXT: i8x16.shr_s +; WASM64-NEXT: v128.bitselect +; WASM64-NEXT: # fallthrough-return + %xor = xor <16 x i8> %x, %y + %and = and <16 x i8> %x, %y + %result = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %xor, <16 x i8> %and) + ret <16 x i8> %result +} + +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll new file mode 100644 index 0000000000000..4e356f8562b39 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-fallback.ll @@ -0,0 +1,909 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s 
-mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test basic ct.select functionality for scalar types +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; W32-LABEL: test_ctselect_i8: +; W32: .functype test_ctselect_i8 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i8: +; W64: .functype test_ctselect_i8 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; W32-LABEL: test_ctselect_i16: +; W32: .functype test_ctselect_i16 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i16: +; W64: .functype test_ctselect_i16 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_i32: +; W32: .functype test_ctselect_i32 (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i32: +; W64: .functype test_ctselect_i32 (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 
@test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; W32-LABEL: test_ctselect_i64: +; W32: .functype test_ctselect_i64 (i32, i64, i64) -> (i64) +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: local.tee 3 +; W32-NEXT: i64.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i64.and +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.add +; W32-NEXT: local.get 2 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_i64: +; W64: .functype test_ctselect_i64 (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; W32-LABEL: test_ctselect_ptr: +; W32: .functype test_ctselect_ptr (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_ptr: +; W64: .functype test_ctselect_ptr (i32, i64, i64) -> (i64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: # fallthrough-return + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_const_true: +; W32: .functype test_ctselect_const_true (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_const_true: +; W64: .functype test_ctselect_const_true (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_const_false: +; W32: .functype test_ctselect_const_false (i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_const_false: +; W64: .functype test_ctselect_const_false (i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: 
test_ctselect_icmp_eq: +; W32: .functype test_ctselect_icmp_eq (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.eq +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_eq: +; W64: .functype test_ctselect_icmp_eq (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.eq +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp eq i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_ne: +; W32: .functype test_ctselect_icmp_ne (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.ne +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_ne: +; W64: .functype test_ctselect_icmp_ne (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.ne +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_slt: +; W32: .functype test_ctselect_icmp_slt (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_s +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_slt: +; W64: .functype test_ctselect_icmp_slt (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_s +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp slt i32 %x, %y + %result = call i32 
@llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; W32-LABEL: test_ctselect_icmp_ult: +; W32: .functype test_ctselect_icmp_ult (i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.lt_u +; W32-NEXT: i32.select +; W32-NEXT: local.tee 1 +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.xor +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_icmp_ult: +; W64: .functype test_ctselect_icmp_ult (i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.lt_u +; W64-NEXT: i32.select +; W64-NEXT: local.tee 1 +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.xor +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; W32-LABEL: test_ctselect_load: +; W32: .functype test_ctselect_load (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_load: +; W64: .functype test_ctselect_load (i32, i64, i64) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested ctselect calls +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; W32-LABEL: test_ctselect_nested: +; W32: .functype test_ctselect_nested (i32, i32, i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 3 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_nested: +; W64: .functype 
test_ctselect_nested (i32, i32, i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 3 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 4 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Test float (32-bit) +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; W32-LABEL: test_ctselect_f32: +; W32: .functype test_ctselect_f32 (i32, f32, f32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32: +; W64: .functype test_ctselect_f32 (i32, f32, f32) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test double (64-bit) +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; W32-LABEL: test_ctselect_f64: +; W32: .functype test_ctselect_f64 (i32, f64, f64) -> (f64) +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: local.tee 3 +; W32-NEXT: i64.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i64.reinterpret_f64 +; W32-NEXT: i64.and +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.add +; W32-NEXT: local.get 2 +; W32-NEXT: i64.reinterpret_f64 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: f64.reinterpret_i64 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f64: +; W64: .functype test_ctselect_f64 (i32, f64, f64) -> (f64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.reinterpret_f64 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.reinterpret_f64 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: 
f64.reinterpret_i64 +; W64-NEXT: # fallthrough-return + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + + +; Test chained float selects +define float @test_ctselect_f32_chain(i1 %cond1, i1 %cond2, float %a, float %b, float %c) { +; W32-LABEL: test_ctselect_f32_chain: +; W32: .functype test_ctselect_f32_chain (i32, i32, f32, f32, f32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 1 +; W32-NEXT: i32.sub +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 2 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 3 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: i32.and +; W32-NEXT: local.get 1 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 4 +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32_chain: +; W64: .functype test_ctselect_f32_chain (i32, i32, f32, f32, f32) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 1 +; W64-NEXT: i32.sub +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 2 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 3 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: i32.and +; W64-NEXT: local.get 1 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 4 +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %tmp = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %result = call float @llvm.ct.select.f32(i1 %cond2, float %tmp, float %c) + ret float %result +} + +; Test with float load +define float @test_ctselect_f32_load(i1 %cond, ptr %p1, ptr %p2) { +; W32-LABEL: test_ctselect_f32_load: +; W32: .functype test_ctselect_f32_load (i32, i32, i32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.load 0 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32_load: +; W64: .functype test_ctselect_f32_load (i32, i64, i64) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.load 0 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %a = 
load float, ptr %p1 + %b = load float, ptr %p2 + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with double load +define double @test_ctselect_f64_load(i1 %cond, ptr %p1, ptr %p2) { +; W32-LABEL: test_ctselect_f64_load: +; W32: .functype test_ctselect_f64_load (i32, i32, i32) -> (f64) +; W32-NEXT: .local i64 +; W32-NEXT: # %bb.0: +; W32-NEXT: i64.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i64.extend_i32_u +; W32-NEXT: i64.const 1 +; W32-NEXT: i64.and +; W32-NEXT: local.tee 3 +; W32-NEXT: i64.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: local.get 3 +; W32-NEXT: i64.const -1 +; W32-NEXT: i64.add +; W32-NEXT: local.get 2 +; W32-NEXT: i64.load 0 +; W32-NEXT: i64.and +; W32-NEXT: i64.or +; W32-NEXT: f64.reinterpret_i64 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f64_load: +; W64: .functype test_ctselect_f64_load (i32, i64, i64) -> (f64) +; W64-NEXT: .local i64 +; W64-NEXT: # %bb.0: +; W64-NEXT: i64.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i64.extend_i32_u +; W64-NEXT: i64.const 1 +; W64-NEXT: i64.and +; W64-NEXT: local.tee 3 +; W64-NEXT: i64.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: local.get 3 +; W64-NEXT: i64.const -1 +; W64-NEXT: i64.add +; W64-NEXT: local.get 2 +; W64-NEXT: i64.load 0 +; W64-NEXT: i64.and +; W64-NEXT: i64.or +; W64-NEXT: f64.reinterpret_i64 +; W64-NEXT: # fallthrough-return + %a = load double, ptr %p1 + %b = load double, ptr %p2 + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test mixed with arithmetic +define float @test_ctselect_f32_arithmetic(i1 %cond, float %x, float %y) { +; W32-LABEL: test_ctselect_f32_arithmetic: +; W32: .functype test_ctselect_f32_arithmetic (i32, f32, f32) -> (f32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: f32.add +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: f32.sub +; W32-NEXT: i32.reinterpret_f32 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: f32.reinterpret_i32 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_ctselect_f32_arithmetic: +; W64: .functype test_ctselect_f32_arithmetic (i32, f32, f32) -> (f32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: f32.add +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: f32.sub +; W64-NEXT: i32.reinterpret_f32 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: f32.reinterpret_i32 +; W64-NEXT: # fallthrough-return + %sum = fadd float %x, %y + %diff = fsub float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %sum, float %diff) + ret float %result +} + +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare ptr 
@llvm.ct.select.p0(i1, ptr, ptr) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll new file mode 100644 index 0000000000000..5b20e892c64d2 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ctselect-side-effects.ll @@ -0,0 +1,226 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=wasm32-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W32 +; RUN: llc < %s -mtriple=wasm64-unknown-unknown -O3 -filetype=asm | FileCheck %s --check-prefix=W64 + +; Test 1: Basic optimizations should still work +define i32 @test_basic_opts(i32 %x) { +; W32-LABEL: test_basic_opts: +; W32: .functype test_basic_opts (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_basic_opts: +; W64: .functype test_basic_opts (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %a = or i32 %x, 0 ; Should eliminate + %b = and i32 %a, -1 ; Should eliminate + %c = xor i32 %b, 0 ; Should eliminate + ret i32 %c +} + +; Test 2: Constant folding should work +define i32 @test_constant_fold() { +; W32-LABEL: test_constant_fold: +; W32: .functype test_constant_fold () -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_constant_fold: +; W64: .functype test_constant_fold () -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %a = xor i32 -1, -1 ; Should fold to 0 + ret i32 %a +} + +; Test 3: Protected pattern should NOT have branches +define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_protected_no_branch: +; W32: .functype test_protected_no_branch (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: local.tee 0 +; W32-NEXT: i32.sub +; W32-NEXT: local.get 1 +; W32-NEXT: i32.and +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const -1 +; W32-NEXT: i32.add +; W32-NEXT: local.get 2 +; W32-NEXT: i32.and +; W32-NEXT: i32.or +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_protected_no_branch: +; W64: .functype test_protected_no_branch (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: local.tee 0 +; W64-NEXT: i32.sub +; W64-NEXT: local.get 1 +; W64-NEXT: i32.and +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const -1 +; W64-NEXT: i32.add +; W64-NEXT: local.get 2 +; W64-NEXT: i32.and +; W64-NEXT: i32.or +; W64-NEXT: # fallthrough-return + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test 4: Explicit branch should still generate branches +define i32 @test_explicit_branch(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_explicit_branch: +; W32: .functype test_explicit_branch (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: block +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.eqz +; W32-NEXT: br_if 0 # 0: down to label0 +; W32-NEXT: # %bb.1: # %true +; W32-NEXT: local.get 1 +; W32-NEXT: return +; W32-NEXT: .LBB3_2: # %false +; W32-NEXT: end_block # label0: +; W32-NEXT: local.get 2 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: 
test_explicit_branch: +; W64: .functype test_explicit_branch (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: block +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.eqz +; W64-NEXT: br_if 0 # 0: down to label0 +; W64-NEXT: # %bb.1: # %true +; W64-NEXT: local.get 1 +; W64-NEXT: return +; W64-NEXT: .LBB3_2: # %false +; W64-NEXT: end_block # label0: +; W64-NEXT: local.get 2 +; W64-NEXT: # fallthrough-return + br i1 %cond, label %true, label %false +true: + ret i32 %a +false: + ret i32 %b +} + +; Test 5: Regular select (not ct.select) - whatever wasm wants to do +define i32 @test_regular_select(i1 %cond, i32 %a, i32 %b) { +; W32-LABEL: test_regular_select: +; W32: .functype test_regular_select (i32, i32, i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 1 +; W32-NEXT: local.get 2 +; W32-NEXT: local.get 0 +; W32-NEXT: i32.const 1 +; W32-NEXT: i32.and +; W32-NEXT: i32.select +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_regular_select: +; W64: .functype test_regular_select (i32, i32, i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 1 +; W64-NEXT: local.get 2 +; W64-NEXT: local.get 0 +; W64-NEXT: i32.const 1 +; W64-NEXT: i32.and +; W64-NEXT: i32.select +; W64-NEXT: # fallthrough-return + %result = select i1 %cond, i32 %a, i32 %b + ret i32 %result +} + +; Test if XOR with all-ones still gets optimized +define i32 @test_xor_all_ones() { +; W32-LABEL: test_xor_all_ones: +; W32: .functype test_xor_all_ones () -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_xor_all_ones: +; W64: .functype test_xor_all_ones () -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %xor1 = xor i32 -1, -1 ; Should optimize to 0 + ret i32 %xor1 +} + +define i32 @test_xor_same_value(i32 %x) { +; W32-LABEL: test_xor_same_value: +; W32: .functype test_xor_same_value (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_xor_same_value: +; W64: .functype test_xor_same_value (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %xor2 = xor i32 %x, %x ; Should optimize to 0 + ret i32 %xor2 +} + +define i32 @test_normal_ops(i32 %x) { +; W32-LABEL: test_normal_ops: +; W32: .functype test_normal_ops (i32) -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: local.get 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_normal_ops: +; W64: .functype test_normal_ops (i32) -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: local.get 0 +; W64-NEXT: # fallthrough-return + %or1 = or i32 %x, 0 + %and1 = and i32 %or1, -1 + %xor1 = xor i32 %and1, 0 + ret i32 %xor1 +} + +; This simulates what the reviewer is worried about +define i32 @test_xor_with_const_operands() { +; W32-LABEL: test_xor_with_const_operands: +; W32: .functype test_xor_with_const_operands () -> (i32) +; W32-NEXT: # %bb.0: +; W32-NEXT: i32.const 0 +; W32-NEXT: # fallthrough-return +; +; W64-LABEL: test_xor_with_const_operands: +; W64: .functype test_xor_with_const_operands () -> (i32) +; W64-NEXT: # %bb.0: +; W64-NEXT: i32.const 0 +; W64-NEXT: # fallthrough-return + %a = xor i32 -1, -1 + %b = xor i32 0, 0 + %c = xor i32 42, 42 + %result = or i32 %a, %b + %final = or i32 %result, %c + ret i32 %final ; Should optimize to 0 +} + +declare i32 @llvm.ct.select.i32(i1, i32, i32) + diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll new file mode 100644 index 
0000000000000..0797265972a1f --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll @@ -0,0 +1,409 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32 + +; Test ct.select edge cases and corner cases + +; Test with very large integers +define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) { +; X64-LABEL: test_ctselect_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: cmovneq %rdx, %r8 +; X64-NEXT: movq %r8, %rdx +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i128: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl %ecx, 12(%eax) +; X32-NEXT: movl %edx, 8(%eax) +; X32-NEXT: movl %edi, 4(%eax) +; X32-NEXT: movl %esi, (%eax) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl $4 + %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b) + ret i128 %result +} + +; Test with small integer types +define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) { +; X64-LABEL: test_ctselect_i1: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i1: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: retl + %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b) + ret i1 %result +} + +; Test with extremal values +define i32 @test_ctselect_extremal_values(i1 %cond) { +; X64-LABEL: test_ctselect_extremal_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_extremal_values: +; X32: # %bb.0: +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648) + ret i32 %result +} + +; Test with floating point special values +define float @test_ctselect_f32_special_values(i1 %cond) { +; X64-LABEL: test_ctselect_f32_special_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000 +; X64-NEXT: 
cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f32_special_values: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: sete %al +; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %ecx +; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000) + ret float %result +} + +define double @test_ctselect_f64_special_values(i1 %cond) { +; X64-LABEL: test_ctselect_f64_special_values: +; X64: # %bb.0: +; X64-NEXT: testb $1, %dil +; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f64_special_values: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: subl $24, %esp +; X32-NEXT: .cfi_def_cfa_offset 36 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X32-NEXT: sete %al +; X32-NEXT: fxch %st(1) +; X32-NEXT: fstpl {{[0-9]+}}(%esp) +; X32-NEXT: fstpl (%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: fldl {{[0-9]+}}(%esp) +; X32-NEXT: addl $24, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000) + ret double %result +} + +; Test with null pointers +define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) { +; X64-LABEL: test_ctselect_null_ptr: +; X64: # %bb.0: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_null_ptr: +; X32: # %bb.0: +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: 
retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null) + ret ptr %result +} + +; Test with function pointers +define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) { +; X64-LABEL: test_ctselect_function_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_function_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2) + ret ptr %result +} + +; Test with volatile loads +define i32 @test_ctselect_volatile_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_volatile_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_volatile_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %a = load volatile i32, ptr %p1 + %b = load volatile i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with atomic loads +define i32 @test_ctselect_atomic_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_atomic_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_atomic_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl (%ecx), %ecx +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl + %a = load atomic i32, ptr %p1 acquire, align 4 + %b = load atomic i32, ptr %p2 acquire, align 4 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test with condition from icmp on pointers +define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_ptr_cmp: +; X64: # %bb.0: +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: sete %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovneq %rdx, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_ptr_cmp: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: sete %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %cmp = icmp eq ptr %p1, %p2 + %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b) + ret ptr %result +} + +; Test with struct pointer types (struct types themselves may not be directly supported) +%struct.pair = type { i32, i32 } + +define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_struct_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_struct_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr 
%a, ptr %b) + ret ptr %result +} + +; Test with deeply nested conditions (stress test for instruction selection) +define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +; X64-LABEL: test_ctselect_deeply_nested: +; X64: # %bb.0: +; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %r8d, %r9d +; X64-NEXT: testb $1, %sil +; X64-NEXT: cmovnel %r9d, %r11d +; X64-NEXT: testb $1, %dl +; X64-NEXT: cmovnel %r11d, %r10d +; X64-NEXT: testb $1, %cl +; X64-NEXT: cmovnel %r10d, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_deeply_nested: +; X32: # %bb.0: +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %esi, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %esi, %edx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %edx, %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e) + ret i32 %sel4 +} + +; Test with misaligned loads +define i32 @test_ctselect_misaligned_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_misaligned_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel (%rsi), %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_misaligned_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel (%ecx), %eax +; X32-NEXT: retl + %a = load i32, ptr %p1, align 1 + %b = load i32, ptr %p2, align 1 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Declare the intrinsics +declare i1 @llvm.ct.select.i1(i1, i1, i1) +declare i128 @llvm.ct.select.i128(i1, i128, i128) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll new file mode 100644 index 0000000000000..ea943307c644f --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll @@ -0,0 +1,722 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV + +; Comprehensive CTSELECT tests for i386 targets with floating-point types +; - Without CMOV: constant-time implementation using FP->int conversion + existing post-RA CTSELECT +; - With CMOV: CMOV-based implementation +; - 
Verifies security properties: no conditional branches, constant execution time +; Strategy: FP values stored to memory, converted to integers, CTSELECT on integers, converted back to FP + +; Test basic f32 functionality +define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test f32 with different condition codes +define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_eq: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fucompp +; I386-NOCMOV-NEXT: fnstsw %ax +; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; I386-NOCMOV-NEXT: sahf +; I386-NOCMOV-NEXT: setnp %al +; I386-NOCMOV-NEXT: sete %cl +; I386-NOCMOV-NEXT: testb %al, %cl +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_eq: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fucompi %st(1), %st +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: 
setnp %al +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %al, %cl +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %cmp = fcmp oeq float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) + ret float %result +} + +; Test basic f64 functionality +define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f64_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $8, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldl (%esp) +; I386-NOCMOV-NEXT: addl $8, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f64_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $8, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldl (%esp) +; I386-CMOV-NEXT: addl $8, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +; Test basic x86_fp80 functionality +define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { 
+; I386-NOCMOV-LABEL: test_ctselect_f80_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f80_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt (%esp) +; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) + ret x86_fp80 %result +} + +; Test f32 with complex conditions +define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: 
test_ctselect_f32_gt: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fucompp +; I386-NOCMOV-NEXT: fnstsw %ax +; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; I386-NOCMOV-NEXT: sahf +; I386-NOCMOV-NEXT: seta %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_gt: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fucompi %st(1), %st +; I386-CMOV-NEXT: fstp %st(0) +; I386-CMOV-NEXT: seta %al +; I386-CMOV-NEXT: testb %al, %al +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %cmp = fcmp ogt float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b) + ret float %result +} + +; Test constant-time properties: verify no branches in generated code +define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_no_branches: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: 
movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test that BUNDLE directives are present for constant-time guarantees +define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_bundled: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: pushl %eax +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $4, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_bundled: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: pushl %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $4, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test edge case: NaN handling +define float @test_ctselect_f32_nan(i1 %cond) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_nan: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; I386-NOCMOV-NEXT: fldz +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: fxch %st(1) +; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fstps (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl (%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_nan: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $12, 
%esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; I386-CMOV-NEXT: fldz +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: fxch %st(1) +; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fstps (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl (%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %nan = bitcast i32 2139095040 to float ; 0x7F800000 = +inf + %zero = bitcast i32 0 to float + %result = call float @llvm.ct.select.f32(i1 %cond, float %nan, float %zero) + ret float %result +} + +; Test memory alignment for f80 +define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f80_alignment: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $12, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: fldt (%esp) +; I386-NOCMOV-NEXT: addl $12, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f80_alignment: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $12, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), 
%edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: fldt (%esp) +; I386-CMOV-NEXT: addl $12, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b) + ret x86_fp80 %result +} + +; Stress test: multiple CTSELECT operations +define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_f32_multiple: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: subl $8, %esp +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movb %al, %ah +; I386-NOCMOV-NEXT: movzbl %ah, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %ecx, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: movl %esi, (%esp) +; I386-NOCMOV-NEXT: flds (%esp) +; I386-NOCMOV-NEXT: addl $8, %esp +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_f32_multiple: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %edi +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: subl $8, %esp +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, %esi +; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %al +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movb %al, %ah +; I386-CMOV-NEXT: movzbl %ah, %edi +; I386-CMOV-NEXT: negl %edi +; I386-CMOV-NEXT: movl %edx, %esi +; I386-CMOV-NEXT: andl %edi, %esi +; I386-CMOV-NEXT: notl %edi +; I386-CMOV-NEXT: andl %ecx, %edi +; I386-CMOV-NEXT: orl %edi, 
%esi +; I386-CMOV-NEXT: movl %esi, (%esp) +; I386-CMOV-NEXT: flds (%esp) +; I386-CMOV-NEXT: addl $8, %esp +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: popl %edi +; I386-CMOV-NEXT: retl + %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b) + %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c) + ret float %sel2 +} + +; Declare intrinsics +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80) diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll new file mode 100644 index 0000000000000..2cb67ba9c29b5 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll @@ -0,0 +1,418 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx < %s | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+cmov,+mmx < %s | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx -verify-machineinstrs < %s | FileCheck %s --check-prefix=I386-NOCMOV + +; Test constant-time selection with MMX intrinsics to exercise VR64 CTSELECT +; These tests use MMX intrinsics to create <1 x i64> values that get allocated to VR64 registers + +; Test MMX ct.select using paddd intrinsic to force VR64 allocation +define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_paddd: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: paddd %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; 
I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_paddd: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: paddd %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Test MMX ct.select using psllw intrinsic +define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_psllw: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: psllw %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl 
%edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_psllw: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: psllw %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Test nested MMX ct.selects with pand intrinsic +define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 %a, i64 %b, i64 %c) { +; I386-NOCMOV-LABEL: test_mmx_nested_ctselect_with_pand: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %edx, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %eax, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ecx +; I386-NOCMOV-NEXT: negl %ecx +; I386-NOCMOV-NEXT: movl %esi, %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: notl %ecx +; I386-NOCMOV-NEXT: andl %eax, %ecx +; I386-NOCMOV-NEXT: orl %ecx, %ebp +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %cl +; I386-NOCMOV-NEXT: testb %cl, %cl +; I386-NOCMOV-NEXT: sete %cl +; I386-NOCMOV-NEXT: movb %cl, %ch +; I386-NOCMOV-NEXT: movzbl %ch, %ebx +; I386-NOCMOV-NEXT: negl %ebx +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %ebx, %esi +; I386-NOCMOV-NEXT: notl %ebx +; I386-NOCMOV-NEXT: andl %ebp, %ebx +; I386-NOCMOV-NEXT: orl %ebx, %esi +; 
I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movb %cl, %ch +; I386-NOCMOV-NEXT: movzbl %ch, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %eax, %edx +; I386-NOCMOV-NEXT: andl %esi, %edx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: orl %esi, %edx +; I386-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: pand %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_nested_ctselect_with_pand: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: pushl %ebx +; I386-CMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-CMOV-NEXT: pushl %esi +; I386-CMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 32 +; I386-CMOV-NEXT: .cfi_offset %esi, -12 +; I386-CMOV-NEXT: .cfi_offset %ebx, -8 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %bl +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %bh +; I386-CMOV-NEXT: testb %bh, %bh +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %esi +; I386-CMOV-NEXT: testb %bl, %bl +; I386-CMOV-NEXT: cmovnel %esi, %edx +; I386-CMOV-NEXT: movl %edx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel %ecx, %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: pand %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-CMOV-NEXT: popl %esi +; I386-CMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-CMOV-NEXT: popl %ebx +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %mmx_c = bitcast i64 %c to <1 x i64> + %cmp1 = icmp ne i32 %cond1, 0 + %cmp2 = icmp ne i32 %cond2, 0 + %sel1 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp2, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %sel2 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp1, <1 x i64> %sel1, <1 x i64> %mmx_c) + %result = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %sel2, <1 x i64> %sel2) + ret <1 x i64> %result +} + +; Test MMX ct.select with por intrinsic +define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) { +; I386-NOCMOV-LABEL: test_mmx_ctselect_with_por: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: subl $20, %esp +; 
I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40 +; I386-NOCMOV-NEXT: .cfi_offset %esi, -20 +; I386-NOCMOV-NEXT: .cfi_offset %edi, -16 +; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %bl +; I386-NOCMOV-NEXT: testb %bl, %bl +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %ebp +; I386-NOCMOV-NEXT: negl %ebp +; I386-NOCMOV-NEXT: movl %esi, %edi +; I386-NOCMOV-NEXT: andl %ebp, %edi +; I386-NOCMOV-NEXT: notl %ebp +; I386-NOCMOV-NEXT: andl %ecx, %ebp +; I386-NOCMOV-NEXT: orl %ebp, %edi +; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %ecx +; I386-NOCMOV-NEXT: andl %esi, %ecx +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %eax, %esi +; I386-NOCMOV-NEXT: orl %esi, %ecx +; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-NOCMOV-NEXT: por %mm0, %mm0 +; I386-NOCMOV-NEXT: movq %mm0, (%esp) +; I386-NOCMOV-NEXT: movl (%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: addl $20, %esp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; I386-NOCMOV-NEXT: popl %ebp +; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_mmx_ctselect_with_por: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: subl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 24 +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %dl +; I386-CMOV-NEXT: testb %dl, %dl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; I386-CMOV-NEXT: por %mm0, %mm0 +; I386-CMOV-NEXT: movq %mm0, (%esp) +; I386-CMOV-NEXT: movl (%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-CMOV-NEXT: addl $20, %esp +; I386-CMOV-NEXT: .cfi_def_cfa_offset 4 +; I386-CMOV-NEXT: retl + %mmx_a = bitcast i64 %a to <1 x i64> + %mmx_b = bitcast i64 %b to <1 x i64> + %cmp = icmp ne i32 %cond, 0 + %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b) + %result = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %sel, <1 x i64> %sel) + ret <1 x i64> %result +} + +; Declare MMX intrinsics +declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>) + +; Declare constant-time selection intrinsic +declare <1 x i64> @llvm.ct.select.v1i64(i1, <1 x i64>, <1 x i64>) diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll new file mode 100644 index 0000000000000..d7345f1121540 --- /dev/null +++ 
b/llvm/test/CodeGen/X86/ctselect-i386.ll @@ -0,0 +1,267 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV + +; Comprehensive CTSELECT tests for i386 targets with scalar integer types +; - Without CMOV: constant-time implementation using post-RA expansion with bundled instructions +; - With CMOV: CMOV-based implementation +; - Verifies security properties: no conditional branches, constant execution time +; All expansion happens post-RA for better optimization control and constant-time guarantees + +; Test basic i32 functionality +define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i32_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test i16 functionality +define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i16_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbw %bh, %si +; I386-NOCMOV-NEXT: negw %si +; I386-NOCMOV-NEXT: movw %dx, %ax +; I386-NOCMOV-NEXT: andw %si, %ax +; I386-NOCMOV-NEXT: notw %si +; I386-NOCMOV-NEXT: andw %cx, %si +; I386-NOCMOV-NEXT: orw %si, %ax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i16_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnew {{[0-9]+}}(%esp), %ax +; I386-CMOV-NEXT: retl + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +; Test i8 functionality +define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i8_basic: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %ah +; I386-NOCMOV-NEXT: movb %ah, %ch +; I386-NOCMOV-NEXT: negb %ch +; I386-NOCMOV-NEXT: movb %dl, %al +; I386-NOCMOV-NEXT: 
andb %ch, %al +; I386-NOCMOV-NEXT: notb %ch +; I386-NOCMOV-NEXT: andb %cl, %ch +; I386-NOCMOV-NEXT: orb %ch, %al +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i8_basic: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax +; I386-CMOV-NEXT: retl + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +; Test security property: constant-time execution for cryptographic use case +define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind { +; I386-NOCMOV-LABEL: test_crypto_key_select: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_crypto_key_select: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp ne i32 %secret_bit, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2) + ret i32 %result +} + +; Test that no conditional branches appear in constant-time path +define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind { +; I386-NOCMOV-LABEL: test_no_conditional_branches: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: setne %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_no_conditional_branches: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: setne %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp ne i32 %secret, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2) + ret i32 %result +} + +; Test with comparison condition +define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_i32_cmp: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; 
I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %al +; I386-NOCMOV-NEXT: testb %al, %al +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %esi +; I386-NOCMOV-NEXT: negl %esi +; I386-NOCMOV-NEXT: movl %edx, %eax +; I386-NOCMOV-NEXT: andl %esi, %eax +; I386-NOCMOV-NEXT: notl %esi +; I386-NOCMOV-NEXT: andl %ecx, %esi +; I386-NOCMOV-NEXT: orl %esi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_i32_cmp: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: sete %cl +; I386-CMOV-NEXT: testb %cl, %cl +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: retl + %cond = icmp eq i32 %a, %c + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c) + ret i32 %result +} + +; Test nested selects +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) nounwind { +; I386-NOCMOV-LABEL: test_ctselect_nested: +; I386-NOCMOV: # %bb.0: +; I386-NOCMOV-NEXT: pushl %ebx +; I386-NOCMOV-NEXT: pushl %edi +; I386-NOCMOV-NEXT: pushl %esi +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %bl +; I386-NOCMOV-NEXT: movb %bl, %bh +; I386-NOCMOV-NEXT: movzbl %bh, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %edx, %esi +; I386-NOCMOV-NEXT: andl %edi, %esi +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %eax, %edi +; I386-NOCMOV-NEXT: orl %edi, %esi +; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-NOCMOV-NEXT: sete %dl +; I386-NOCMOV-NEXT: movb %dl, %dh +; I386-NOCMOV-NEXT: movzbl %dh, %edi +; I386-NOCMOV-NEXT: negl %edi +; I386-NOCMOV-NEXT: movl %ecx, %eax +; I386-NOCMOV-NEXT: andl %edi, %eax +; I386-NOCMOV-NEXT: notl %edi +; I386-NOCMOV-NEXT: andl %esi, %edi +; I386-NOCMOV-NEXT: orl %edi, %eax +; I386-NOCMOV-NEXT: popl %esi +; I386-NOCMOV-NEXT: popl %edi +; I386-NOCMOV-NEXT: popl %ebx +; I386-NOCMOV-NEXT: retl +; +; I386-CMOV-LABEL: test_ctselect_nested: +; I386-CMOV: # %bb.0: +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; I386-CMOV-NEXT: cmovnel %ecx, %eax +; I386-CMOV-NEXT: retl + %sel1 = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %sel1, i32 %c) + ret i32 %sel2 +} + +; Declare ct.select intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll new file mode 100644 index 0000000000000..481d49971a937 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll @@ -0,0 +1,304 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s + +; Test ct.select optimization patterns + +; Test smin(x, 0) pattern optimization +define i32 @test_ctselect_smin_zero(i32 %x) { +; CHECK-LABEL: test_ctselect_smin_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; 
CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test smax(x, 0) pattern optimization +define i32 @test_ctselect_smax_zero(i32 %x) { +; CHECK-LABEL: test_ctselect_smax_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: setg %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp sgt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0) + ret i32 %result +} + +; Test generic smin pattern +define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_smin_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setl %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test generic smax pattern +define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_smax_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setg %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp sgt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umin pattern +define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_umin_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test umax pattern +define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) { +; CHECK-LABEL: test_ctselect_umax_generic: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: seta %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %cmp = icmp ugt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y) + ret i32 %result +} + +; Test abs pattern +define i32 @test_ctselect_abs(i32 %x) { +; CHECK-LABEL: test_ctselect_abs: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: negl %ecx +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %dl +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x) + ret i32 %result +} + +; Test nabs pattern (negative abs) +define i32 @test_ctselect_nabs(i32 %x) { +; CHECK-LABEL: test_ctselect_nabs: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: negl %eax +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %neg = sub i32 0, %x + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg) + ret i32 %result +} + +; Test sign extension pattern +define i32 @test_ctselect_sign_extend(i32 %x) { +; CHECK-LABEL: test_ctselect_sign_extend: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; 
CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test zero extension pattern +define i32 @test_ctselect_zero_extend(i32 %x) { +; CHECK-LABEL: test_ctselect_zero_extend: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: setne %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp ne i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0) + ret i32 %result +} + +; Test mask generation pattern +define i32 @test_ctselect_mask_generation(i32 %x) { +; CHECK-LABEL: test_ctselect_mask_generation: +; CHECK: # %bb.0: +; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: movl $-1, %ecx +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp slt i32 %x, 0 + %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0) + ret i32 %result +} + +; Test constant folding with known condition +define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_constant_folding_true: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movb $1, %cl +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_constant_folding_false: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %edi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with identical operands +define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) { +; CHECK-LABEL: test_ctselect_identical_operands: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovnel %esi, %eax +; CHECK-NEXT: retq + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x) + ret i32 %result +} + +; Test with inverted condition +define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_ctselect_inverted_condition: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: sete %dl +; CHECK-NEXT: testb %dl, %dl +; CHECK-NEXT: cmovnel %ecx, %eax +; CHECK-NEXT: retq + %cmp = icmp eq i32 %x, %y + %not_cmp = xor i1 %cmp, true + %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b) + ret i32 %result +} + +; Test for 64-bit specific optimizations +define i64 @test_ctselect_i64_smin_zero(i64 %x) { +; CHECK-LABEL: test_ctselect_i64_smin_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: sets %cl +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovneq %rdi, %rax +; CHECK-NEXT: retq + %cmp = icmp slt i64 %x, 0 + %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0) + ret i64 %result +} + +; Test for floating point optimizations +define float @test_ctselect_f32_zero_positive(float %x) { +; CHECK-LABEL: test_ctselect_f32_zero_positive: +; CHECK: # %bb.0: +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: ucomiss 
%xmm1, %xmm0 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovnel %eax, %edx +; CHECK-NEXT: movd %edx, %xmm0 +; CHECK-NEXT: retq + %cmp = fcmp ogt float %x, 0.0 + %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0) + ret float %result +} + +define double @test_ctselect_f64_zero_positive(double %x) { +; CHECK-LABEL: test_ctselect_f64_zero_positive: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: xorpd %xmm1, %xmm1 +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovneq %rax, %rdx +; CHECK-NEXT: movq %rdx, %xmm0 +; CHECK-NEXT: retq + %cmp = fcmp ogt double %x, 0.0 + %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0) + ret double %result +} + +; Test chain of ct.select operations +define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: test_ctselect_chain: +; CHECK: # %bb.0: +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovnel %ecx, %r8d +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: cmovnel %r8d, %r9d +; CHECK-NEXT: testb $1, %dl +; CHECK-NEXT: cmovnel %r9d, %eax +; CHECK-NEXT: retq + %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b) + %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c) + %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d) + ret i32 %sel3 +} + +; Declare the intrinsics +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll new file mode 100644 index 0000000000000..2206e32cd6d34 --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect-vector.ll @@ -0,0 +1,1274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 + +; Test ct.select functionality for vector types + +; 128-bit vectors +define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: 
movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB0_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB0_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: test_ctselect_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movaps %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB1_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB1_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) + ret <4 x float> %result +} + +define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) { +; SSE2-LABEL: test_ctselect_v2i64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB2_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm0, %xmm1 +; AVX512-NEXT: .LBB2_2: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) + ret <2 x i64> %result +} + +define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) { +; SSE2-LABEL: test_ctselect_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movapd %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v2f64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB3_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %xmm0, %xmm1 +; AVX512-NEXT: .LBB3_2: +; AVX512-NEXT: vmovapd %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) + ret <2 x double> %result +} + +; 256-bit vectors +define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) { +; SSE2-LABEL: test_ctselect_v8i32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax 
+; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB4_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB4_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %result +} + +define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) { +; SSE2-LABEL: test_ctselect_v8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movaps %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; 
AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB5_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB5_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) + ret <8 x float> %result +} + +define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: test_ctselect_v4i64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB6_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %ymm0, %ymm1 +; AVX512-NEXT: .LBB6_2: +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <4 x i64> @llvm.ct.select.v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) + ret <4 x i64> %result +} + +define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) { +; SSE2-LABEL: test_ctselect_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: movapd %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd 
%eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm3, %ymm3 +; AVX-NEXT: vmovd %eax, %ymm3 +; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm3, %ymm2 +; AVX-NEXT: pand %ymm0, %ymm3 +; AVX-NEXT: pandn %ymm1, %ymm2 +; AVX-NEXT: por %ymm3, %ymm2 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm3, %ymm3 +; AVX2-NEXT: vmovd %eax, %ymm3 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-NEXT: pand %ymm0, %ymm3 +; AVX2-NEXT: pandn %ymm1, %ymm2 +; AVX2-NEXT: por %ymm3, %ymm2 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB7_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %ymm0, %ymm1 +; AVX512-NEXT: .LBB7_2: +; AVX512-NEXT: vmovapd %ymm1, %ymm0 +; AVX512-NEXT: retq + %result = call <4 x double> @llvm.ct.select.v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) + ret <4 x double> %result +} + +; 512-bit vectors (AVX512 only) +define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) { +; SSE2-LABEL: test_ctselect_v16i32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v16i32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: 
movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB8_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB8_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <16 x i32> @llvm.ct.select.v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) + ret <16 x i32> %result +} + +define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) { +; SSE2-LABEL: test_ctselect_v16f32: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movaps %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn 
%xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v16f32: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB9_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB9_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <16 x float> @llvm.ct.select.v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) + ret <16 x float> %result +} + +define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) { +; SSE2-LABEL: test_ctselect_v8i64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, 
%xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB10_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %zmm0, %zmm1 +; AVX512-NEXT: .LBB10_2: +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <8 x i64> @llvm.ct.select.v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) + ret <8 x i64> %result +} + +define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) { +; SSE2-LABEL: test_ctselect_v8f64: +; SSE2: # %bb.0: +; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: movd %eax, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE2-NEXT: movapd %xmm9, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm9, %xmm8 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; 
SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movapd %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm7, %xmm6 +; SSE2-NEXT: por %xmm0, %xmm6 +; SSE2-NEXT: movaps %xmm8, %xmm0 +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: movaps %xmm5, %xmm2 +; SSE2-NEXT: movaps %xmm6, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v8f64: +; AVX: # %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm5, %ymm5 +; AVX-NEXT: vmovd %eax, %ymm5 +; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm5, %ymm4 +; AVX-NEXT: pand %ymm0, %ymm5 +; AVX-NEXT: pandn %ymm2, %ymm4 +; AVX-NEXT: por %ymm5, %ymm4 +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %ymm0, %ymm0 +; AVX-NEXT: vmovd %eax, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX-NEXT: vmovdqa %ymm0, %ymm2 +; AVX-NEXT: pand %ymm1, %ymm0 +; AVX-NEXT: pandn %ymm3, %ymm2 +; AVX-NEXT: por %ymm0, %ymm2 +; AVX-NEXT: vmovaps %ymm4, %ymm0 +; AVX-NEXT: vmovaps %ymm2, %ymm1 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm5, %ymm5 +; AVX2-NEXT: vmovd %eax, %ymm5 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-NEXT: pand %ymm0, %ymm5 +; AVX2-NEXT: pandn %ymm2, %ymm4 +; AVX2-NEXT: por %ymm5, %ymm4 +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %ymm0, %ymm0 +; AVX2-NEXT: vmovd %eax, %ymm0 +; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2-NEXT: vmovdqa %ymm0, %ymm2 +; AVX2-NEXT: pand %ymm1, %ymm0 +; AVX2-NEXT: pandn %ymm3, %ymm2 +; AVX2-NEXT: por %ymm0, %ymm2 +; AVX2-NEXT: vmovaps %ymm4, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: testb %dil, %dil +; AVX512-NEXT: je .LBB11_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovapd %zmm0, %zmm1 +; AVX512-NEXT: .LBB11_2: +; AVX512-NEXT: vmovapd %zmm1, %zmm0 +; AVX512-NEXT: retq + %result = call <8 x double> @llvm.ct.select.v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) + ret <8 x double> %result +} + +; Test with constant conditions for vector types +define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_const_true: +; SSE2: # %bb.0: +; SSE2-NEXT: movb $1, %al +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn 
%xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_const_true: +; AVX: # %bb.0: +; AVX-NEXT: movb $1, %al +; AVX-NEXT: testb %al, %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_const_true: +; AVX2: # %bb.0: +; AVX2-NEXT: movb $1, %al +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32_const_true: +; AVX512: # %bb.0: +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_const_false: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movl $0, %eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_const_false: +; AVX: # %bb.0: +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: testb %al, %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_const_false: +; AVX2: # %bb.0: +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32_const_false: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: retq + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 false, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Test with comparison conditions for vector types +define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: test_ctselect_v4i32_icmp: +; SSE2: # %bb.0: +; SSE2-NEXT: cmpl %esi, %edi +; SSE2-NEXT: sete %al +; SSE2-NEXT: testb %al, %al +; SSE2-NEXT: movl $0, 
%eax +; SSE2-NEXT: setne %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_ctselect_v4i32_icmp: +; AVX: # %bb.0: +; AVX-NEXT: cmpl %esi, %edi +; AVX-NEXT: sete %al +; AVX-NEXT: testb %al, %al +; AVX-NEXT: movl $0, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: negl %eax +; AVX-NEXT: pxor %xmm3, %xmm3 +; AVX-NEXT: movd %eax, %xmm3 +; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX-NEXT: movdqa %xmm3, %xmm2 +; AVX-NEXT: pand %xmm0, %xmm3 +; AVX-NEXT: pandn %xmm1, %xmm2 +; AVX-NEXT: por %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test_ctselect_v4i32_icmp: +; AVX2: # %bb.0: +; AVX2-NEXT: cmpl %esi, %edi +; AVX2-NEXT: sete %al +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: movl $0, %eax +; AVX2-NEXT: setne %al +; AVX2-NEXT: movzbl %al, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: pxor %xmm3, %xmm3 +; AVX2-NEXT: movd %eax, %xmm3 +; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX2-NEXT: movdqa %xmm3, %xmm2 +; AVX2-NEXT: pand %xmm0, %xmm3 +; AVX2-NEXT: pandn %xmm1, %xmm2 +; AVX2-NEXT: por %xmm3, %xmm2 +; AVX2-NEXT: vmovaps %xmm2, %xmm0 +; AVX2-NEXT: retq +; AVX512-LABEL: test_ctselect_v4i32_icmp: +; AVX512: # %bb.0: +; AVX512-NEXT: cmpl %esi, %edi +; AVX512-NEXT: je .LBB14_2 +; AVX512-NEXT: # %bb.1: +; AVX512-NEXT: vmovaps %xmm1, %xmm0 +; AVX512-NEXT: .LBB14_2: +; AVX512-NEXT: retq + %cond = icmp eq i32 %x, %y + %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %result +} + +; Declare the intrinsics +declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>) +declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>) +declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>) +declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>) +declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>) +declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>) +declare <4 x i64> @llvm.ct.select.v4i64(i1, <4 x i64>, <4 x i64>) +declare <4 x double> @llvm.ct.select.v4f64(i1, <4 x double>, <4 x double>) +declare <16 x i32> @llvm.ct.select.v16i32(i1, <16 x i32>, <16 x i32>) +declare <16 x float> @llvm.ct.select.v16f32(i1, <16 x float>, <16 x float>) +declare <8 x i64> @llvm.ct.select.v8i64(i1, <8 x i64>, <8 x i64>) +declare <8 x double> @llvm.ct.select.v8f64(i1, <8 x double>, <8 x double>) diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll new file mode 100644 index 0000000000000..3f6276add0a5c --- /dev/null +++ b/llvm/test/CodeGen/X86/ctselect.ll @@ -0,0 +1,946 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=X32-NOCMOV + +; Test basic ct.select functionality for scalar types + +define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) { +; X64-LABEL: test_ctselect_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, 
%eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i8: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i8: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %ah +; X32-NOCMOV-NEXT: movb %ah, %ch +; X32-NOCMOV-NEXT: negb %ch +; X32-NOCMOV-NEXT: movb %dl, %al +; X32-NOCMOV-NEXT: andb %ch, %al +; X32-NOCMOV-NEXT: notb %ch +; X32-NOCMOV-NEXT: andb %cl, %ch +; X32-NOCMOV-NEXT: orb %ch, %al +; X32-NOCMOV-NEXT: retl + %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b) + ret i8 %result +} + +define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) { +; X64-LABEL: test_ctselect_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i16: +; X32: # %bb.0: +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i16: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbw %bh, %si +; X32-NOCMOV-NEXT: negw %si +; X32-NOCMOV-NEXT: movw %dx, %ax +; X32-NOCMOV-NEXT: andw %si, %ax +; X32-NOCMOV-NEXT: notw %si +; X32-NOCMOV-NEXT: andw %cx, %si +; X32-NOCMOV-NEXT: orw %si, %ax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b) + ret i16 %result +} + +define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edx, %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i32: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; 
X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) { +; X64-LABEL: test_ctselect_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_i64: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_i64: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -20 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %ebp +; X32-NOCMOV-NEXT: negl %ebp +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %ebp, %eax +; X32-NOCMOV-NEXT: notl %ebp +; X32-NOCMOV-NEXT: andl %ecx, %ebp +; X32-NOCMOV-NEXT: orl %ebp, %eax +; X32-NOCMOV-NEXT: movb %bl, %cl +; X32-NOCMOV-NEXT: movzbl %cl, %ebp +; X32-NOCMOV-NEXT: negl %ebp +; X32-NOCMOV-NEXT: movl %edi, %edx +; X32-NOCMOV-NEXT: andl %ebp, %edx +; X32-NOCMOV-NEXT: notl %ebp +; X32-NOCMOV-NEXT: andl %esi, %ebp +; X32-NOCMOV-NEXT: orl %ebp, %edx +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b) + ret i64 %result +} + +define float @test_ctselect_f32(i1 %cond, float %a, float %b) { +; X64-LABEL: test_ctselect_f32: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movd %xmm1, %ecx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f32: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: 
popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_f32: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: flds (%esp) +; X32-NOCMOV-NEXT: addl $4, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +define double @test_ctselect_f64(i1 %cond, double %a, double %b) { +; X64-LABEL: test_ctselect_f64: +; X64: # %bb.0: +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: movq %xmm1, %rcx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rax, %rcx +; X64-NEXT: movq %rcx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_f64: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: subl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NEXT: fldl (%esp) +; X32-NEXT: addl $8, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_f64: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: subl $8, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -8 +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; 
X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: fldl (%esp) +; X32-NOCMOV-NEXT: addl $8, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b) + ret double %result +} + +define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) { +; X64-LABEL: test_ctselect_ptr: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovneq %rsi, %rax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_ptr: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_ptr: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b) + ret ptr %result +} + +; Test with constant conditions +define i32 @test_ctselect_const_true(i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_const_true: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movb $1, %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_const_true: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movb $1, %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_const_true: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb $1, %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; 
X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_const_false(i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_const_false: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_const_false: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %ecx +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_const_false: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: xorl %eax, %eax +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b) + ret i32 %result +} + +; Test with comparison conditions +define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_eq: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: sete %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_eq: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: sete %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_eq: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = icmp eq 
i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_ne: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setne %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_ne: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: setne %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_ne: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setne %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = icmp ne i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_slt: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setl %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_slt: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: setl %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_slt: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setl %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = icmp slt i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + 
+define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) { +; X64-LABEL: test_ctselect_icmp_ult: +; X64: # %bb.0: +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: setb %cl +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_icmp_ult: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: setb %cl +; X32-NEXT: testb %cl, %cl +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_icmp_ult: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: setb %al +; X32-NOCMOV-NEXT: testb %al, %al +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = icmp ult i32 %x, %y + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) { +; X64-LABEL: test_ctselect_fcmp_oeq: +; X64: # %bb.0: +; X64-NEXT: movd %xmm2, %eax +; X64-NEXT: movd %xmm3, %ecx +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: setnp %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb %dl, %sil +; X64-NEXT: cmovnel %eax, %ecx +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_fcmp_oeq: +; X32: # %bb.0: +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %eax +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: .cfi_offset %esi, -12 +; X32-NEXT: .cfi_offset %edi, -8 +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: flds {{[0-9]+}}(%esp) +; X32-NEXT: fucompi %st(1), %st +; X32-NEXT: fstp %st(0) +; X32-NEXT: setnp %al +; X32-NEXT: sete %cl +; X32-NEXT: testb %al, %cl +; X32-NEXT: sete %al +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movb %al, %ah +; X32-NEXT: movzbl %ah, %edi +; X32-NEXT: negl %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: andl %edi, %esi +; X32-NEXT: notl %edi +; X32-NEXT: andl %ecx, %edi +; X32-NEXT: orl %edi, %esi +; X32-NEXT: movl %esi, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: addl $4, %esp +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %eax +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: 
.cfi_offset %edi, -8 +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: fucompp +; X32-NOCMOV-NEXT: fnstsw %ax +; X32-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax +; X32-NOCMOV-NEXT: sahf +; X32-NOCMOV-NEXT: setnp %al +; X32-NOCMOV-NEXT: sete %cl +; X32-NOCMOV-NEXT: testb %al, %cl +; X32-NOCMOV-NEXT: sete %al +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: movb %al, %ah +; X32-NOCMOV-NEXT: movzbl %ah, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %ecx, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: movl %esi, (%esp) +; X32-NOCMOV-NEXT: flds (%esp) +; X32-NOCMOV-NEXT: addl $4, %esp +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %cond = fcmp oeq float %x, %y + %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b) + ret float %result +} + +; Test with memory operands +define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) { +; X64-LABEL: test_ctselect_load: +; X64: # %bb.0: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel (%rsi), %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_load: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel (%ecx), %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_load: +; X32-NOCMOV: # %bb.0: +; X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl (%ecx), %ecx +; X32-NOCMOV-NEXT: movl (%eax), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %esi +; X32-NOCMOV-NEXT: negl %esi +; X32-NOCMOV-NEXT: movl %edx, %eax +; X32-NOCMOV-NEXT: andl %esi, %eax +; X32-NOCMOV-NEXT: notl %esi +; X32-NOCMOV-NEXT: andl %ecx, %esi +; X32-NOCMOV-NEXT: orl %esi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %a = load i32, ptr %p1 + %b = load i32, ptr %p2 + %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b) + ret i32 %result +} + +; Test nested ctselect calls +define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) { +; X64-LABEL: test_ctselect_nested: +; X64: # %bb.0: +; X64-NEXT: movl %r8d, %eax +; X64-NEXT: testb $1, %sil +; X64-NEXT: cmovnel %edx, %ecx +; X64-NEXT: testb $1, %dil +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_ctselect_nested: +; X32: # %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx +; X32-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NEXT: cmovnel %ecx, %eax +; X32-NEXT: retl +; +; X32-NOCMOV-LABEL: test_ctselect_nested: +; X32-NOCMOV: # %bb.0: +; 
X32-NOCMOV-NEXT: pushl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: pushl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: pushl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16 +; X32-NOCMOV-NEXT: .cfi_offset %esi, -16 +; X32-NOCMOV-NEXT: .cfi_offset %edi, -12 +; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8 +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %bl +; X32-NOCMOV-NEXT: movb %bl, %bh +; X32-NOCMOV-NEXT: movzbl %bh, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %edx, %esi +; X32-NOCMOV-NEXT: andl %edi, %esi +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %eax, %edi +; X32-NOCMOV-NEXT: orl %edi, %esi +; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp) +; X32-NOCMOV-NEXT: sete %dl +; X32-NOCMOV-NEXT: movb %dl, %dh +; X32-NOCMOV-NEXT: movzbl %dh, %edi +; X32-NOCMOV-NEXT: negl %edi +; X32-NOCMOV-NEXT: movl %ecx, %eax +; X32-NOCMOV-NEXT: andl %edi, %eax +; X32-NOCMOV-NEXT: notl %edi +; X32-NOCMOV-NEXT: andl %esi, %edi +; X32-NOCMOV-NEXT: orl %edi, %eax +; X32-NOCMOV-NEXT: popl %esi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12 +; X32-NOCMOV-NEXT: popl %edi +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8 +; X32-NOCMOV-NEXT: popl %ebx +; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4 +; X32-NOCMOV-NEXT: retl + %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b) + %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c) + ret i32 %result +} + +; Declare the intrinsics +declare i8 @llvm.ct.select.i8(i1, i8, i8) +declare i16 @llvm.ct.select.i16(i1, i16, i16) +declare i32 @llvm.ct.select.i32(i1, i32, i32) +declare i64 @llvm.ct.select.i64(i1, i64, i64) +declare float @llvm.ct.select.f32(i1, float, float) +declare double @llvm.ct.select.f64(i1, double, double) +declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
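
For context, a minimal C-level sketch of how the builtin exercised by these llc tests might be used; the ct_max_u32 helper and its use case are illustrative assumptions, not part of the patch:

/* Hypothetical usage sketch (not from the patch): pick the larger of two
 * values without a secret-dependent branch. The three-argument form
 * __builtin_ct_select(cond, a, b) matches the Sema/CodeGen checks added
 * earlier in this patch and lowers to the llvm.ct.select intrinsic whose
 * x86 lowering is checked above. */
static unsigned ct_max_u32(unsigned a, unsigned b) {
  return __builtin_ct_select(a > b, a, b);
}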